def vcf_to_vca(self, vcf_path):
    """Read every record of a VCF file into a list.

    Reading is best-effort: if the reader cannot be created, or iteration
    fails part way through, the error is logged as a warning and the
    records collected up to that point are returned.

    :param vcf_path: value handed to ``ht.VCF_Reader``
    :return: list of the records yielded by the reader (possibly empty or
        partial when an error occurred)
    """
    import logging

    vca = []
    try:
        for vc in ht.VCF_Reader(vcf_path):
            vca.append(vc)
    except Exception as err:
        # Deliberately broad: callers rely on always getting a list back.
        # The failure is reported instead of being dropped silently.
        logging.getLogger(__name__).warning(
            "vcf_to_vca: reading %r stopped early: %s", vcf_path, err)
    return vca
def vcf_glob_to_svultd(path_glob, chroms, offset_map, flt=0, flt_exclude=None):
    """Build per-sample results for every VCF file matching a glob pattern.

    For each matching file a ``ht.VCF_Reader`` is opened, a sample id is
    derived from the file path with ``id_trim`` and ``construct_svult`` is
    called; the two values it returns are stored under that sample id.

    :param path_glob: glob pattern selecting the VCF files
    :param chroms: passed through to ``construct_svult``
    :param offset_map: passed through to ``construct_svult``
    :param flt: filter value passed to ``construct_svult`` for samples that
        are not listed in ``flt_exclude``
    :param flt_exclude: container of sample ids that get ``-1`` instead of
        ``flt`` (presumably disabling the filter -- confirm against
        ``construct_svult``); ``None`` means no sample is excluded
    :return: tuple ``(S, V)`` of dictionaries keyed by sample id
    """
    # A None default avoids sharing one mutable list between calls.
    excluded = () if flt_exclude is None else flt_exclude
    S, V = {}, {}
    for vcf in glob.glob(path_glob):
        vcr = ht.VCF_Reader(vcf)
        s_id = id_trim(vcf)
        sample_flt = -1 if s_id in excluded else flt
        S[s_id], V[s_id] = construct_svult(vcr, chroms, offset_map, s_id,
                                           sample_flt)
    return S, V
def _allele_frequencies(info):
    """
    Return the allele frequencies of an unpacked INFO dictionary as a list

    :param dict info: INFO fields of a record, after `unpack_info`
    :return: the value of `AF`, or `AC / AN` when `AF` is absent; a single
        value is wrapped in a list
    """
    # Samtools mpileup -> bcftools call doesn't output the allele freq.
    # it can be calculated with AC/AN for each ALT nucleotide
    # checked on bcftools (roh command) manual
    # https://samtools.github.io/bcftools/bcftools.html
    try:
        allele_freqs = info['AF']
    except KeyError:
        if isinstance(info['AC'], list):
            allele_freqs = [AC / info['AN'] for AC in info['AC']]
        else:
            allele_freqs = info['AC'] / info['AN']

    # if the allele frequency is a single value, make it a list, so the
    # iteration in the caller works anyway. Any scalar is wrapped (not only
    # floats), otherwise an integer value would make zip() fail
    if not isinstance(allele_freqs, (list, tuple)):
        allele_freqs = [allele_freqs]

    return allele_freqs


def _samples_carrying_alt(vcf_record, alt_index):
    """
    Return the set of sample ids whose genotype includes an ALT allele

    :param vcf_record: record whose `samples` maps sample id to a
        dictionary with a `GT` entry
    :param int alt_index: 0-based index of the ALT allele in the record
    :return: set of sample ids
    """
    # genotype numbers are compared against `alt_index + 1`
    allele_number = alt_index + 1
    samples = set()
    for sample_id, sample_info in vcf_record.samples.items():
        # prepare the genotype list, to make the comparison easier
        # the genotype separator to '/' only, to use only one type of split
        genotypes = sample_info['GT'].replace('|', '/').split('/')
        for genotype in genotypes:
            if genotype == '.':
                continue
            if int(genotype) == allele_number:
                samples.add(sample_id)
    return samples


def parse_vcf(vcf_file, snp_data, min_reads, min_af, min_qual, annotations,
              seqs, options, line_num=100000):
    """
    Parse VCF file counts synonymous and non-synonymous SNPs

    Nothing is returned: every accepted SNP is handed to
    `check_snp_in_set` together with `snp_data`.

    :param file vcf_file: file handle to a VCF file
    :param dict snp_data: dictionary from :func:`init_count_set` with per
        sample SNPs information
    :param int min_reads: minimum number of reads to accept a SNP
    :param float min_af: minimum allele frequency to accept a SNP
    :param int min_qual: minimum quality (Phred score) to accept a SNP
    :param dict annotations: annotations grouped by their reference sequence
    :param dict seqs: reference sequences
    :param options: object whose `bcftools_vcf` attribute selects how the
        samples carrying a SNP are found: from the per-sample genotypes if
        true, from the `set` INFO field otherwise
    :param int line_num: the interval in number of lines at which progress
        will be printed
    """
    vcf_handle = HTSeq.VCF_Reader(compressed_handle(vcf_file))

    vcf_handle.parse_meta()
    vcf_handle.make_info_dict()

    # total number of SNPs accepted
    count_tot = 0
    # number of SNPs skipped for low depth
    skip_dp = 0
    # number of SNPs skipped for low allele frequency
    skip_af = 0
    # number of SNPs skipped for low quality
    skip_qual = 0
    # indels
    skip_indels = 0

    for vcf_record in vcf_handle:
        # the SNP is a sequence with no annotations
        if vcf_record.chrom not in annotations:
            continue

        if float(vcf_record.qual) < min_qual:
            # low quality SNP
            skip_qual += 1
            continue

        # unpack info records (needed for vcf_record.info to be a dictionary)
        vcf_record.unpack_info(vcf_handle.infodict)

        if vcf_record.info['INDEL']:
            skip_indels += 1
            continue

        if not isinstance(vcf_record.info['DP'], int):
            LOG.warning(vcf_record.info['DP'])

        if vcf_record.info['DP'] < min_reads:
            # not enough reads (depth) for the SNP
            skip_dp += 1
            continue

        allele_freqs = _allele_frequencies(vcf_record.info)

        # alt is the nucleotidic change
        iter_data = zip(allele_freqs, vcf_record.alt)
        for alt_index, (allele_freq, change) in enumerate(iter_data):
            if allele_freq < min_af:
                # the allele frequency for the SNP is too low, it'll be
                # skipped
                skip_af += 1
                continue

            if options.bcftools_vcf:
                samples = _samples_carrying_alt(vcf_record, alt_index)
            else:
                # the samples that contain the SNP is a string separated
                # by '-'
                samples = vcf_record.info['set'].split('-')

            check_snp_in_set(
                samples,
                snp_data,
                vcf_record.pos.start,
                change,
                annotations[vcf_record.chrom],
                seqs[vcf_record.chrom]
            )

            # increase the total number of snps available
            count_tot += 1

        if vcf_handle.line_no % line_num == 0:
            LOG.info(
                "Line %d, SNPs passed %d; skipped for: qual %d, " +
                "depth %d, freq %d, indels %d",
                vcf_handle.line_no, count_tot, skip_qual, skip_dp, skip_af,
                skip_indels
            )
for feature in gff_file: if feature.type == "transcript": transcript[feature.name] = { 'iv': feature.iv, # .iv is GenomicInterval 'CDSfeats': [] } if feature.type == "CDS": transcript[feature.attr["Parent"]]['CDSfeats'].append(feature.iv) ## Future worry: do I need CDS.frame in transcript object? CDSfeat[feature.iv] = feature print( "# Chrom\tPos\tPos in CDS\tBase change\tAA change\tAA pos in transcript\ttranscript ID" ) vcfr = HTSeq.VCF_Reader(sys.argv[3]) for vc in vcfr: vCDS = CDSfeat[vc.pos] # vCDS.iv.start is base before 1st base of CDS if not vCDS == None and not vc.pos.start == vCDS.iv.start: vTranscript = transcript[vCDS.attr["Parent"]] refseq = str( HTSeq.Sequence( sequences[vCDS.iv.chrom].seq[vCDS.iv.start:vCDS.iv.end])) refseqT = ''.join( str(HTSeq.Sequence(sequences[CDS.chrom].seq[CDS.start:CDS.end])) for CDS in vTranscript['CDSfeats']) relpos = vc.pos.start - vCDS.iv.start # if variant is 1st base of CDS, relpos=1 if refseq[relpos - 1] != vc.ref:
def __iter__(self):
    """Yield a ``SNP`` for each usable call of ``self.sample`` in the VCF.

    A record is skipped when the sample's genotype contains ``.``, when
    any of the called alleles is not exactly one base long, or when fewer
    than two distinct alleles were called. The yielded position is
    ``vc.pos.pos - 1``; the chromosome name is prefixed with ``chr`` when
    ``self.add_chrom_prefix`` is set.

    :raises AnnotationParseError: if ``self.sample`` is missing from a
        record, or the number of called alleles differs from
        ``self.ploidy``
    """
    self.mc.log_debug('vcf_path: {}'.format(self.vcf_path))
    self.mc.log_debug('sample: {}'.format(self.sample))
    self.mc.log_debug('ploidy: {}'.format(self.ploidy))
    self.mc.log_debug('add_chrom_prefix: {}'.format(self.add_chrom_prefix))

    vcf = HTSeq.VCF_Reader(self.vcf_path)
    vcf.parse_meta()
    self.mc.handle_progress('Reading VCF file...')

    for n, vc in enumerate(vcf):
        if n != 0 and n % 500000 == 0:
            self.mc.handle_progress(
                '{} lines read from VCF file...'.format(n))

        if self.sample not in vc.samples:
            raise AnnotationParseError(
                self.vcf_path,
                'Sample "{}" not in VCF file.'.format(self.sample))

        gt = vc.samples[self.sample]['GT']
        if '.' in gt:
            continue

        if '/' in gt:
            # Any '/' separator makes the whole call unphased; mixed
            # separators are normalised to '/'.
            phased = False
            if '|' in gt:
                gt = gt.replace('|', '/')
            sep = '/'
        else:
            # NOTE(review): a genotype without any separator (e.g. a
            # haploid call) fails here -- confirm whether that is intended.
            assert '|' in gt
            phased = True
            sep = '|'

        gt = gt.split(sep)
        if len(gt) != self.ploidy:
            raise AnnotationParseError(
                self.vcf_path,
                'The ploidy({}) may be inconsistent with the '
                'sample "{}"({}).'.format(self.ploidy, self.sample,
                                          len(gt)))

        ref_alt = [vc.ref] + vc.alt
        alleles = [ref_alt[int(g)] for g in gt]

        # Skip the record unless every called allele is a single base.
        # The previous inner loop only `continue`d itself and therefore
        # never filtered anything.
        if any(len(allele) != 1 for allele in alleles):
            continue
        if len(set(alleles)) < 2:
            continue

        chrom = vc.pos.chrom
        if self.add_chrom_prefix:
            chrom = 'chr{}'.format(chrom)
        pos = vc.pos.pos - 1

        snp = SNP(chrom, pos, alleles, phased)
        assert self.ploidy == snp.ploidy
        yield snp
) sys.exit() try: bool_keepSDCO = True str_vcfName = sys.argv[1] parent1 = sys.argv[2] parent2 = sys.argv[3] GENOTYPEQUALITYTHRESHOLD = float(sys.argv[4]) MISSINGNESSTHRESHOLD = float(sys.argv[5]) sys.stderr.write("\tvcfToRqtl\n\tMissingness threshold: %s\n" % MISSINGNESSTHRESHOLD) sys.stderr.write("\tGenotype Quality threshold: %s\n" % GENOTYPEQUALITYTHRESHOLD) vcfFile = HTSeq.VCF_Reader(str_vcfName) if sys.argv[6] == "--removeSDCO": sys.stderr.write("\tWill remove short range double crossovers\n") bool_keepSDCO = False elif sys.argv[6] == "--keepSDCO": sys.stderr.write("\tWill keep short range double crossovers\n") bool_keepSDCO = True else: usage() except IndexError: # Check if arguments were given sys.stderr.write( "Insufficient arguments received. Please check your input:\n") usage() except IOError: # Check if file is unabled to be opened. sys.stderr.write("Cannot open target file. Please check your input:\n") usage()
fh.write('\n') fh.close() return out_geno, mID_lookup if __name__ == "__main__": #vcfn,qd,gq,chi2crit = sys.argv[1:] vcfn, outbase, gq, fract_max = sys.argv[1:5] gq = float(gq) fract_max = float(fract_max) #outbase = os.path.splitext(vcfn)[0] vcfr = HTSeq.VCF_Reader(vcfn) vcfr.parse_meta() vcfr.make_info_dict() ped, recombinants, parents, parents_spp = sample_data_from_DB( vcfr.sampleids) tests = species_tests_by_family(ped, recombinants, parents_spp) polarized_loci, polarized_geno = cross_genotypes_from_htseq_vcf(vcfr, tests, gq_cut=gq) loc_counts = dict([ (loc, sum([polarized_geno[ind].has_key(loc) for ind in recombinants])) for loc in polarized_loci ]) mct = max(loc_counts.values())