def select(self, jar, outvcf, ref, snp=False, pass_only=False, indel=False):
    """ Subset the VCF with GATK SelectVariants

    Parameters
    ----------
    jar: str
        GATK jar path
    outvcf: str
        output vcf path, passed with ``-o``
    ref: str
        reference, passed with ``-R``
    snp: bool
        append ``-selectType SNP``
    pass_only: bool
        append ``-ef``
    indel: bool
        append ``-selectType INDEL``

    Returns
    -------
    self

    Raises
    ------
    ValueError
        when both ``snp`` and ``indel`` are requested, or when none of
        ``snp``, ``pass_only`` and ``indel`` is set
    """
    if snp and indel:
        raise ValueError("Cannot select both SNPs and InDels")
    if not (snp or pass_only or indel):
        raise ValueError("At least select one type of variants")

    pieces = [
        'java -jar ', jar,
        '-T SelectVariants',
        '-R', ref,
        '-V', self._vcf,
        '-o', outvcf,
    ]
    command = ' '.join(pieces)
    # Optional switches are appended in a fixed order: filter, SNP, INDEL.
    if pass_only:
        command += ' -ef'
    if snp:
        command += ' -selectType SNP'
    if indel:
        command += ' -selectType INDEL'
    run(command)
    return self
def cal_miss(self, name='miss'):
    """ Calculate missingness of all sites

    Calls ``self.get_plink()`` first, then runs ``plink --missing`` on
    ``self._plink`` with ``self._prefix`` as the output prefix.

    :param name: currently unused
    :returns: self
    """
    # TO DO
    self.get_plink()
    # Built after get_plink() so that self._plink is read once it is set.
    plink_cmd = ' '.join([
        'plink --bfile', self._plink,
        '--missing --allow-extra-chr --out', self._prefix,
    ])
    run(plink_cmd)
    return self
def import_vcf(self, info=('AF', 'AN', 'AC')):
    """ Import info from a VCF

    Description
    -----------
    Query ``self._vcf`` with ``bcftools query``, write one tab-separated
    row per record to ``self._tsv`` and load that table into ``self._df``.

    Caveat: This module assumes the VCF's coming from GATK, with AF
    as the field for allele frequencies, and AC for Allele Count, and AN
    for Allelic Number.

    Parameters
    ----------
    info: list
        A list that contains names of info fields of interest

    Returns
    -------
    self
    """
    # Copy into a fresh list: the default is immutable and any iterable of
    # field names is accepted.
    info = list(info)
    header = ['CHR', 'ID'] + info
    query_string = '\'%CHROM\t%CHROM-%POS-%REF-%ALT{0}\t'
    # The format ends with an explicit newline so every record lands on its
    # own row, the same way import_snpeff builds its format string.
    query_string += '\t'.join('%' + field for field in info) + '\n\''
    cmd = ' '.join([
        "bcftools query -f ", query_string, self._vcf, '>', self._tsv])
    run(cmd)
    # Read back the file that was just written (previously an undefined
    # name was referenced here).
    self._df = pd.read_csv(self._tsv, sep='\t', header=None, names=header)
    return self
def get_plink(self):
    """ Get plink format files

    Runs ``plink --vcf`` on ``self._vcf`` with ``self._prefix`` as the
    output prefix and records that prefix in ``self._plink``.

    :returns: self
    """
    # str.join takes a single iterable, so the pieces must be in a list.
    cmd = ' '.join([
        'plink --vcf', self._vcf,
        '--allow-extra-chr',
        '--out', self._prefix,
    ])
    run(cmd)
    self._plink = self._prefix
    return self
def pilon(fa, bam, prefix, ram, threads, jar):
    """ Run pilon commands

    Parameters
    ----------
        fa: :obj:`str` fasta file, passed with ``--genome``
        bam: :obj:`str` input bam path, passed with ``--frags``
        prefix: :obj:`str` output prefix; the log goes to
            ``<prefix>.pilon.log``
        ram: :obj:`int` java heap size in GB (``-Xmx<ram>g``)
        threads: :obj:`int` threads for pilon
        jar: :obj:`str` pilon jar path

    Returns
    -------
        :obj:`str` the command string that was handed to ``run``
    """
    java_part = ['java -Xmx' + str(ram) + 'g', '-jar', jar]
    pilon_part = [
        '--genome', fa,
        '--frags', bam,
        '--output', prefix,
        '--threads', str(threads),
        '--vcf --changes --tracks --verbose',
    ]
    # stdout and stderr are both redirected into the log file.
    redirect_part = ['>', prefix + '.pilon.log', '2>&1']
    command = ' '.join(java_part + pilon_part + redirect_part)
    run(command)
    return command
def combine_var(self, vcf_dict, option, priority=None):
    '''
    Merge several VCFs with GATK CombineVariants
    :param vcf_dict: dictionary of vcf files, with key abbreviation of each
                     vcf
    :param option: merging option, one of UNIQUIFY, PRIORITIZE, UNSORTED
    :param priority: priority order of the vcf_dict keys, required when
                     option is PRIORITIZE; either a comma-separated string
                     or an iterable of names
    :returns: output vcf path (self.prefix + '.vcf.gz')
    :raises ValueError: on an unknown option, or PRIORITIZE without priority
    '''
    options = ('UNIQUIFY', 'PRIORITIZE', 'UNSORTED')
    # Validate the arguments before touching any instance state.
    if option not in options:
        raise ValueError('Merge option not valid.\n')
    prioritize = option == 'PRIORITIZE'
    if prioritize and priority is None:
        raise ValueError('Need to specify priority.\n')

    out_vcf = self.prefix + '.vcf.gz'
    merge_option = option
    if option == 'UNSORTED':
        merge_option += ' --assumeIdenticalSamples'
    # '-o' matches the output flag used by the other '-T' style GATK calls
    # built on self.cmd (select_var, variant_eval).
    cmd = ' '.join([
        self.cmd, '-T CombineVariants', '-genotypeMergeOptions',
        merge_option, '-o', out_vcf
    ])
    if prioritize:
        # The priority list was validated above but never reached the
        # command line; pass it on explicitly.
        if not isinstance(priority, str):
            priority = ','.join(priority)
        cmd += ' -priority ' + priority
    for name, vcf in vcf_dict.items():
        if prioritize:
            # Tag each input with its name so the priority list can refer
            # to it.
            cmd += ' --variant:' + name + ' ' + vcf
        else:
            cmd += ' --variant ' + vcf
    run(cmd)
    return out_vcf
def fa2phylip(fa, output, jar):
    '''
    transfer fasta file to phylip with java tool readSeq
    :param fa: fasta file
    :param output: path of the phylip file to write
    :param jar: path to readseq.jar
    :returns: output
    '''
    # '-f 12' is the output format number given to readseq; '-o' sends the
    # result to the path this function returns (it was previously never
    # passed to the tool).
    cmd = ' '.join(['java -cp', jar, 'run -f 12', '-o', output, fa])
    run(cmd)
    return output
def process_pilon_out(log, outdir, prefix):
    """ process pilon output with ``pilon_metrics``

    log: logfile, passed with ``-l``
    outdir: output directory, passed with ``-d``
    prefix: output prefix, passed with ``--out_prefix``

    Returns the command string that was handed to ``run``.
    """
    arguments = ['-d', outdir, '-l', log, '--out_prefix', prefix]
    command = ' '.join(['pilon_metrics'] + arguments)
    run(command)
    return command
def FastTreeDP(in_fa, out_prefix):
    '''
    perform FastTreeDP analysis
    :param in_fa: input fasta file
    :param out_prefix: output file prefix
    :returns: nwk file path (out_prefix + '.nwk')
    '''
    out_nwk = out_prefix + '.nwk'
    # The command's stdout is redirected into the .nwk file.
    command = ' '.join(['FastTreeDP -nt', in_fa, '>', out_nwk])
    run(command)
    return out_nwk
def fasttree(fa, prefix):
    '''
    Run FastTreeDP
    :param fa: fasta file
    :param prefix: output prefix
    :returns: nwk file path (prefix + '.nwk')
    '''
    # Use the same path for the redirect and the return value; the redirect
    # previously referenced an undefined name.
    out_nwk = prefix + '.nwk'
    cmd = ' '.join([
        'FastTreeDP -nt', fa, '>', out_nwk
    ])
    run(cmd)
    return out_nwk
def tabix(file, type=None):
    """ Index tabix file
    :param file: input file
    :param type: file type, vcf; appended as ``-p <type>`` when truthy
    :returns: index path (file + '.tbi')
    """
    words = ['tabix', file]
    if type:
        words.extend(['-p', type])
    run(' '.join(words))
    return file + '.tbi'
def cal_dos(self, haploid=True):
    """ Get a genotype dosage matrix from the VCF

    Dumps the GT field of every sample with ``bcftools query`` into
    ``<self._prefix>.dos.tsv`` and loads it, whitespace-separated and with
    '.' read as missing, into ``self._dosage_matrix``.

    :param haploid: only the haploid case is implemented
    :returns: self
    :raises ValueError: when ``haploid`` is false
    """
    dos_file = self._prefix+'.dos.tsv'
    if not haploid:
        raise ValueError("Not yet support polyploid.")

    query_cmd = "bcftools query -f '[%GT ]\\n' " + self._vcf + '>' + dos_file
    run(query_cmd)
    matrix = pd.read_csv(dos_file, sep=r'\s+', header=None, na_values='.')
    self._dosage_matrix = matrix
    return self
def ramxl(phylip, output, threads):
    ''' Run RAxML
    :param phylip: input phylip format file, passed with ``-s``
    :param output: output file name, passed with ``-n``
    :param threads: number of threads, passed with ``-T``
    :returns: output
    '''
    fixed_part = ('raxmlHPC-PTHREADS-SSE3 -p 78960 -f a -x 12345 -N 1000 '
                  '-m GTRCAT')
    variable_part = ['-T', str(threads), '-n', output, '-s', phylip]
    run(' '.join([fixed_part] + variable_part))
    return output
def get_info(self, info=('AF',)):
    """ Get variant site level info of interest

    Queries ``self._vcf`` with ``bcftools query``, writes one tab-separated
    row per record to ``self._site_info_tsv`` and loads that table into
    ``self._site_info``.

    :param info: names of the fields to extract
    :returns: self, like the other import methods, so calls can be chained
    """
    # Copy into a fresh list: the default is immutable and any iterable of
    # field names is accepted.
    info = list(info)
    header = ['CHR', 'ID'] + info
    query_string = '\'%CHROM\t%CHROM-%POS-%REF-%ALT{0}\t'
    # The format ends with an explicit newline so every record lands on its
    # own row, the same way import_snpeff builds its format string.
    query_string += '\t'.join('%' + field for field in info) + '\n\''
    cmd = ' '.join([
        "bcftools query -f ", query_string, self._vcf, '>',
        self._site_info_tsv])
    run(cmd)
    self._site_info = pd.read_csv(self._site_info_tsv, sep='\t',
                                  header=None, names=header)
    return self
def filterGatkGenotypes(vcf, out_prefix):
    """ filter Gatk output vcf with fixed cutoffs (GQ 50, AD 0.8, DP 10)
    :param vcf: input vcf file
    :param out_prefix: output prefix
    :returns: output path (out_prefix + '_GQ50_AD08_DP10.vcf')
    """
    outfile = out_prefix + '_GQ50_AD08_DP10.vcf'
    words = [
        'filterGatkGenotypes.py --min_GQ 50 --min_percent_alt_in_AD 0.8',
        '--min_total_DP 10',
    ]
    # Input file, then redirect the script's stdout into the output file.
    words += [vcf, '>', outfile]
    run(' '.join(words))
    return outfile
def create_snpeff_db(gff3, dir, genome, config, prefix, ram, jar, ref_fa):
    """ Create snpEff database

    gff3: gff file of gene annotation
    dir: first argument handed to snpeff_db.sh
    genome: name of the reference genome
    config: snpEff config files (currently unused)
    prefix: output Prefix (currently unused)
    ram: RAM in GB
    jar: snpEff jar
    ref_fa: reference fasta file

    Returns the command string that was handed to ``run``.
    """
    # Keep the command in a variable so it can be returned (it was never
    # assigned before), and stringify ram so a numeric value can be joined.
    cmd = ' '.join(['snpeff_db.sh', dir, jar, genome, ref_fa, gff3,
                    str(ram)])
    run(cmd)
    return cmd
def vcf_snp_to_fasta(invcf, prefix, max_amb=10):
    ''' snp only vcf to fasta file
    :param invcf: input vcf file
    :param prefix: output file prefix
    :param max_amb: maximum number of samples with ambiguous calls for a
                    site to be included, recommended number of samples 10%,
                    use a very large number to disable this function 100000
                    ( legacy options and will not be maintained.)
    :returns: fasta path (prefix + '.fasta')
    '''
    out_fasta = prefix + '.fasta'
    # max_amb defaults to an int, which str.join rejects; stringify it.
    cmd = ' '.join(['vcfSnpsToFasta.py --max_amb_samples', str(max_amb),
                    invcf, '>', out_fasta])
    run(cmd)
    return out_fasta
def filter_variants(invcf, outvcf, min_GQ=50, AD=0.8, DP=10):
    """ apply variant filtering using GQ, AD and DP
    :param invcf: input vcf
    :param outvcf: output vcf, receives the script's stdout
    :param min_GQ: minimum GQ cutoff
    :param AD: allelic depth cutoff
    :param DP: depth cutoff
    :returns: outvcf
    """
    thresholds = (
        ('--min_GQ', min_GQ),
        ('--min_percent_alt_in_AD', AD),
        ('--min_total_DP', DP),
    )
    words = ['filterGatkGenotypes.py']
    for flag, cutoff in thresholds:
        words.append(flag)
        words.append(str(cutoff))
    words.extend([invcf, '>', outvcf])
    run(' '.join(words))
    return outvcf
def genotype_concordance(self, comp, eval, hap=False):
    ''' compare two VCFs with GATK GenotypeConcordance
    :param comp: VCF file for comparison
    :param eval: VCF file for evaluation
    :param hap: whether input is haploid VCF (currently unused)
    :returns: result path, self.out_dir + '/' + self.prefix + '.txt'
    '''
    result_path = '/'.join([self.out_dir, self.prefix + '.txt'])
    words = [self.cmd, '-T GenotypeConcordance']
    words += ['--comp', comp]
    words += ['--eval', eval]
    words += ['--out', result_path]
    run(' '.join(words))
    return result_path
def snpeff_db(gff3, dir, genome, config, prefix, ram, jar, ref_fa):
    """ Create snpEff database

    gff3: gff file of gene annotation
    dir: first argument handed to snpeff_db.sh
    genome: name of the reference genome
    config: snpEff config files (currently unused)
    prefix: output Prefix (currently unused)
    ram: RAM in GB
    jar: snpEff jar; its directory is what gets passed to the script
    ref_fa: reference fasta file

    Returns the command string that was handed to ``run``.
    """
    snpeff_dir = os.path.dirname(jar)
    # Stringify ram so a numeric value can be joined, as the other
    # java-launching helpers in this file do.
    cmd = ' '.join(['sh snpeff_db.sh', dir, snpeff_dir, genome, ref_fa,
                    gff3, str(ram)])
    run(cmd)
    return cmd
def snpeff_annot(self, jar, config, genome, ram):
    """ run SNPEFF on the instance's vcf

    The annotated output is piped through bgzip and written to
    ``self.ann_vcf``, which is the basename of ``self._vcf`` with its last
    'vcf' turned into 'snpeff.vcf'.

    jar: snpeff jar
    config: configuration file
    genome: tag of genome name
    ram: memory in GB

    Returns self.
    """
    base = os.path.basename(self._vcf)
    # Rewrite only the last 'vcf' (the extension). A plain str.replace would
    # also rewrite any earlier 'vcf' inside the file name.
    self.ann_vcf = 'snpeff.vcf'.join(base.rsplit('vcf', 1))
    run(' '.join([
        'java -Xmx'+str(ram)+'g',
        '-jar', jar,
        'ann', '-v',
        '-c', config,
        '-i vcf -o vcf',
        genome, self._vcf,
        '| bgzip >', self.ann_vcf]))
    return self
def select_var(self, in_vcf, xl=None, il=None):
    '''
    select variants with GATK SelectVariants
    :param in_vcf: input vcf
    :param xl: intervals to exclude
    :param il: intervals to include
    :returns: output vcf path (self.prefix + '.vcf.gz')
    '''
    output = self.prefix + '.vcf.gz'
    cmd = ' '.join([
        self.cmd, '-T SelectVariants', '--variant', in_vcf, '-o', output
    ])
    # Each optional argument brings its own leading space; without it the
    # flag fused with the preceding token (and with the other flag when
    # both were given).
    if xl is not None:
        cmd += ' -XL ' + xl
    if il is not None:
        cmd += ' -L ' + il
    run(cmd)
    return output
def coverage_barplot(cov_tsv, prefix, color_csv, legacy, no_sub):
    """ Generate coverage barplot with coverage_barplot.R
    :param cov_tsv: coverage profile list, passed with ``-i``
    :param prefix: output prefix, passed with ``-p``
    :param color_csv: passed with ``-c``
    :param legacy: if output tsv in legacy mode, compatible with matlab code
    :param no_sub: boolean, whether input genome has a subgenome
    :returns: 1
    """
    base = ' '.join(['coverage_barplot.R', '-i', cov_tsv, '-p', prefix,
                     '-c', color_csv])
    switches = ''
    if legacy:
        switches += ' -l'
    if no_sub:
        switches += ' --nosub'
    run(base + switches)
    print(' - Finish generating coverage plot.')
    return 1
def snpeff(invcf, outvcf, jar, config, genome, ram):
    """ run SNPEFF on a vcf

    invcf: input vcf
    outvcf: output vcf, receives snpEff's stdout
    jar: snpeff jar
    genome: tag of genome name
    ram: memory in GB
    config: configuration file

    Returns the command string that was handed to ``run``.
    """
    java_part = ['java -Xmx' + str(ram) + 'g', '-jar', jar]
    eff_part = [
        'eff', '-v',
        '-c', config,
        '-onlyCoding False',
        '-i vcf -o vcf',
        genome, invcf,
    ]
    command = ' '.join(java_part + eff_part + ['>', outvcf])
    run(command)
    return command
def variant_eval(self, vcf, titv=True, samp=True, indel=True, multi=True):
    '''
    VCF sample QC by different stratifications (GATK VariantEval)
    :param vcf: input vcf
    :param titv: use TiTv Evaluator
    :param samp: stratify by samples
    :param indel: use InDel Evaluator
    :param multi: summarize multiallelic sites
    :returns: result path, self.prefix + '.eval' inside self.out_dir
    '''
    out = os.path.join(self.out_dir, self.prefix + '.eval')
    base = ' '.join([
        self.cmd, '-T VariantEval', '--eval', vcf, '-o', out,
        '-noEV -noST -EV CountVariants'
    ])
    # (enabled, switch) pairs, kept in the order they are appended.
    toggles = (
        (titv, ' -EV TiTvVariantEvaluator'),
        (samp, ' -ST Sample'),
        (indel, ' -EV IndelSummary'),
        (multi, ' -EV MultiallelicSummary'),
    )
    extras = ''.join(switch for enabled, switch in toggles if enabled)
    run(base + extras)
    return out
def import_snpeff(self, snpeff_tsv=None):
    """ Load snpEff annotations into ``self._site_info``

    When ``snpeff_tsv`` is None the table is first generated from
    ``self._vcf`` with ``bcftools query`` and written to
    ``<self._prefix>.snpeff.tsv``; otherwise the given file is read as is.

    :param snpeff_tsv: headerless, tab-separated table with columns CHR,
                       POS, REF, ALT followed by the info fields listed
                       below
    :returns: self
    """
    # Defined up front: the column names are needed for reading whether or
    # not the table is generated here.
    info_fields = [
        'AF', 'AN', 'AC',
        'SNPEFF_AMINO_ACID_CHANGE',
        'SNPEFF_CODON_CHANGE',
        'SNPEFF_EFFECT',
        'SNPEFF_EXON_ID',
        'SNPEFF_FUNCTIONAL_CLASS',
        'SNPEFF_GENE_BIOTYPE',
        'SNPEFF_GENE_NAME',
        'SNPEFF_IMPACT',
        'SNPEFF_TRANSCRIPT_ID'
    ]
    if snpeff_tsv is None:
        snpeff_tsv = self._prefix+'.snpeff.tsv'
        query = ('\'%CHROM\t%POS\t%REF\t%ALT\t'
                 + '\t'.join(['%INFO/'+i for i in info_fields])
                 + '\n\'')
        run('bcftools query -f {} '.format(query)
            + self._vcf+'> '+snpeff_tsv)
    self._site_info = pd.read_csv(
        snpeff_tsv, sep='\t', header=None,
        names=['CHR', 'POS', 'REF', 'ALT']+info_fields)
    return self
def af(self):
    """ get allele frequencies using vcftools

    Runs ``vcftools --freq2`` on ``self._vcf`` with output prefix ``tmp``,
    loads ``tmp.frq`` into ``self._af`` and then removes that file.

    :returns: self
    """
    # NOTE(review): the output prefix is the fixed name 'tmp' in the
    # working directory, so two calls running at once would share it.
    run("vcftools --gzvcf "+self._vcf + " --freq2 --out tmp")
    self._af = pd.read_csv('tmp.frq', sep='\t', header=0)
    rm('tmp.frq')
    return self