def parse_vcf(assembly, vcf_infile, compressed=True, verbose=True, by_id=True, **tabix_params):
    """Yield documents parsed from the records of a tabix-indexed VCF file.

    Every record is converted with ``parse_one_rec(assembly, rec)``.  Records
    whose document has a falsy ``'_id'`` are skipped and counted.

    Args:
        assembly: passed to ``parse_one_rec``; also the key of the per-allele
            list inside a document that is split when ``by_id`` is true.
        vcf_infile: path of the VCF file, handed to ``Reader(filename=...)``.
        compressed: whether the file is compressed.  A file name ending in
            ``.gz`` is always treated as compressed.
        verbose: when true, log ``rsid`` and ``_id`` of each yielded document.
        by_id: when true, a document whose ``'_id'`` is a list is expanded
            into one shallow copy per id ("one hgvs id, one doc"); otherwise
            the document is yielded as parsed ("one rsid, one doc").
        **tabix_params: if given, forwarded to the reader's tabix ``fetch`` to
            restrict iteration to that region.

    Yields:
        dict: one document per id or per record, depending on ``by_id``.
    """
    t0 = time.time()
    # The original statement here was `compressed == vcf_infile.endswith('.gz')`,
    # a comparison whose result was thrown away.  Give it an effect without
    # overriding an explicit compressed=True from the caller.
    compressed = compressed or vcf_infile.endswith('.gz')
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)   # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)

    total_rs, total_docs, skipped_rs = 0, 0, 0
    for rec in vcf_r:
        total_rs += 1
        doc = parse_one_rec(assembly, rec)
        if not doc['_id']:
            skipped_rs += 1
            continue
        if by_id and isinstance(doc['_id'], list):
            # one hgvs id, one doc: pick the i-th alt/position for the i-th id
            for i, _id in enumerate(doc['_id']):
                _doc = copy.copy(doc)
                _doc['alt'] = doc['alt'][i]
                _doc[assembly] = doc[assembly][i]
                _doc['_id'] = _id
                yield _doc
                total_docs += 1
                if verbose:
                    logging.info("%s\t%s", _doc['rsid'], _doc['_id'])
        else:
            # single id, or one rsid per doc
            yield doc
            total_docs += 1
            if verbose:
                logging.info("%s\t%s", doc['rsid'], doc['_id'])

    logging.info("Done. [%s]", timesofar(t0))
    logging.info("Total rs: %s; total docs: %s; skipped rs: %s",
                 total_rs, total_docs, skipped_rs)
def assertVcfHasVariantWithCall(self, vcf, chrom, pos, sample, call):
    """
    Assert that a call is made for a given sample in a given position.

    `call` is a dict corresponding to elements in the vcf sample field.
    Every key/value pair in `call` must be present in the sample's data;
    extra fields in the sample data are ignored.

    Example:

        self.assertVcfHasVariantWithCall(my_vcf, 1, 3184885, 'B',
                                         call={'GT': '1/2', 'DP': 10})

    Raises:
        AssertionError: if no record at chrom:pos has a matching call for
            `sample` (after `assertVcfHasSample` has passed).
    """
    self.assertVcfHasSample(vcf, sample)
    reader = Reader(filename=vcf)
    chrom_name = str(chrom)
    for variant in reader.fetch(chrom=chrom, start=pos - 1, end=pos):
        if not (variant.CHROM == chrom_name and variant.POS == pos):
            continue
        for sample_call in variant.samples:
            if sample_call.sample == sample:
                data = sample_call.data
                # namedtuple instances have no __dict__ on Python >= 3.5.1,
                # so prefer _asdict(); fall back to __dict__ for other
                # record types.
                if hasattr(data, '_asdict'):
                    fields = data._asdict()
                else:
                    fields = data.__dict__
                # Compare key by key rather than through sets so that
                # unhashable values (e.g. list-valued fields) are supported.
                if all(key in fields and fields[key] == value
                       for key, value in call.items()):
                    return
    raise AssertionError(
        "Call {} not present for sample {} at {}:{} in {}".format(
            call, sample, chrom, pos, vcf))
def get_haplotype_stats(template_vcf: vcf.Reader, in_vcf: vcf.Reader, out):
    """Accumulate haplotype statistics over all contigs and write a total row.

    For each contig listed in ``in_vcf.contigs`` the per-contig statistics
    returned by ``get_haplotype_stats_chromo`` are merged into one
    ``HapStats``.  A contig for which any step raises is skipped.

    Args:
        template_vcf: reader for the template VCF; ``fetch(contig)`` is
            called on it before the contig is processed.
        in_vcf: reader for the VCF being evaluated; supplies the contig names.
        out: writable text stream; also passed on to
            ``get_haplotype_stats_chromo``.  Receives the final
            tab-separated ``total`` line.
    """
    hap_stats = HapStats()
    for contig in in_vcf.contigs.keys():
        try:
            template_vcf.fetch(contig)
            template_chromo = ChromosomoHaplotype(template_vcf, contig)
            in_chromo = ChromosomoHaplotype(in_vcf, contig)
            chromo_hap_stats = get_haplotype_stats_chromo(
                template_chromo, in_chromo, out, contig)
            hap_stats.insert_hap_stats(chromo_hap_stats)
        except Exception:
            # Best-effort: skip contigs that cannot be processed.  Catching
            # Exception instead of a bare except lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
    out.write("%s\t%d\t%d\t%d\t%d\t%.8f\t%.8f\n" % (
        "total",
        hap_stats.get_AN50(),
        hap_stats.get_N50(),
        hap_stats.get_total_phased(),
        hap_stats.get_total_spanned(),
        hap_stats.get_switch_error(),
        hap_stats.get_mismatch_error()))
def parse_vcf(vcf_infile, compressed=True, verbose=True, by_id=True, **tabix_params):
    """Yield documents parsed from the records of a tabix-indexed VCF file.

    Every record is converted with ``parse_one_rec(rec)``.  Records whose
    document has a falsy ``'_id'`` are skipped and counted.  Progress and the
    final summary are printed to standard output.

    Args:
        vcf_infile: path of the VCF file, handed to ``Reader(filename=...)``.
        compressed: whether the file is compressed.  A file name ending in
            ``.gz`` is always treated as compressed.
        verbose: when true, print ``rsid`` and ``_id`` of each yielded document.
        by_id: when true, a document whose ``'_id'`` is a list is expanded
            into one shallow copy per id ("one hgvs id, one doc"), taking the
            matching element of ``doc['alt']`` and ``doc[POS_KEY]``; otherwise
            the document is yielded as parsed ("one rsid, one doc").
        **tabix_params: if given, forwarded to the reader's tabix ``fetch`` to
            restrict iteration to that region.

    Yields:
        dict: one document per id or per record, depending on ``by_id``.
    """
    t0 = time.time()
    # The original statement here was `compressed == vcf_infile.endswith('.gz')`,
    # a comparison whose result was thrown away.  Give it an effect without
    # overriding an explicit compressed=True from the caller.
    compressed = compressed or vcf_infile.endswith('.gz')
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)   # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)

    total_rs, total_docs, skipped_rs = 0, 0, 0
    for rec in vcf_r:
        total_rs += 1
        doc = parse_one_rec(rec)
        if not doc['_id']:
            skipped_rs += 1
            continue
        if by_id and isinstance(doc['_id'], list):
            # one hgvs id, one doc: pick the i-th alt/position for the i-th id
            for i, _id in enumerate(doc['_id']):
                _doc = copy.copy(doc)
                _doc['alt'] = doc['alt'][i]
                _doc[POS_KEY] = doc[POS_KEY][i]
                _doc['_id'] = _id
                yield _doc
                total_docs += 1
                if verbose:
                    print(_doc['rsid'], '\t', _doc['_id'])
        else:
            # single id, or one rsid per doc
            yield doc
            total_docs += 1
            if verbose:
                print(doc['rsid'], '\t', doc['_id'])

    print("Done. [{}]".format(timesofar(t0)))
    print("Total rs: {}; total docs: {}; skipped rs: {}".format(total_rs, total_docs, skipped_rs))
def write_chromosome(in_vcf: vcf.Reader, out_vcf: vcf.Writer, chromo_haplotype: ChromosomoHaplotype, contig: str):
    """Write every record of ``contig`` from ``in_vcf`` to ``out_vcf``.

    Records whose first sample has ``gt_type == 1`` (heterozygous loci) are
    first passed to ``finalize_record`` of the entry stored under their
    position in ``chromo_haplotype.chromo_record``; all other records are
    written through unchanged.
    """
    rec: vcf.model._Record
    for rec in in_vcf.fetch(contig):
        is_het = rec.samples[0].gt_type == 1
        if is_het:
            # presumably updates `rec` in place before it is written -- confirm
            chromo_haplotype.chromo_record[rec.POS].finalize_record(rec)
        out_vcf.write_record(rec)
def assertVcfHasVariantAt(self, vcf, chrom, pos):
    """Assert that `vcf` contains a record at `chrom`:`pos`.

    Fetches the window ``pos - 1 .. pos`` and checks each returned record's
    CHROM (compared against ``str(chrom)``) and POS.

    Raises:
        AssertionError: if no fetched record matches.
    """
    reader = Reader(filename=vcf)
    candidates = reader.fetch(chrom=chrom, start=pos - 1, end=pos)
    # Evaluate every candidate (no short-circuit) before deciding.
    hits = [record.CHROM == str(chrom) and record.POS == pos
            for record in candidates]
    if any(hits):
        return
    raise AssertionError("Variant at {}:{} not present in {}".format(
        chrom, pos, vcf))
def assertVcfHasVariantWithChromPosRefAlt(self, vcf, chrom, pos, ref, alt):
    """Assert that `vcf` has a record at `chrom`:`pos` with the given alleles.

    A record matches when its CHROM equals ``str(chrom)``, its POS equals
    `pos`, its REF equals `ref`, and `alt` is contained in its ALT.

    Raises:
        AssertionError: if no fetched record matches.
    """
    def _is_match(record):
        return (record.CHROM == str(chrom)
                and record.POS == pos
                and record.REF == ref
                and alt in record.ALT)

    reader = Reader(filename=vcf)
    candidates = reader.fetch(chrom=chrom, start=pos - 1, end=pos)
    # Evaluate every candidate (no short-circuit) before deciding.
    hits = [_is_match(record) for record in candidates]
    if any(hits):
        return
    raise AssertionError(
        "Variant at {}:{} {}/{} not present in {}".format(
            chrom, pos, ref, alt, vcf))
def __init__(self, in_vcf: vcf.Reader, chromo: str):
    """Index the heterozygous records of `chromo` and group them by phase set.

    Only records whose first sample has ``gt_type == 1`` are kept.  Each kept
    record gets a phase-set label:

    * 0 when the first sample is not phased;
    * 1 when it is phased but FORMAT has no ``PS`` field;
    * otherwise the POS of the first record seen with the same ``PS`` value.

    Populates ``chromo_record`` (pos -> Record), ``chromo_phase_set``
    (label -> PhaseSet) and ``chromo_record2phaseset_map`` (pos -> label) for
    records whose ``ps`` is non-zero.
    """
    self.chromo_record = {}
    self.chromo_phase_set = {}
    self.chromo_record2phaseset_map = {}
    self.graph_struct = graph.Graph()

    rec: vcf.model._Record
    first_pos_of_ps = {}   # original PS value -> POS of its first record
    next_idx = 0
    for rec in in_vcf.fetch(chromo):
        first_call = rec.samples[0]
        if first_call.gt_type != 1:  # not het loci
            continue

        ps_label = 0
        if first_call.phased:
            if 'PS' in rec.FORMAT.split(':'):
                # relabel the phase set by the position of its first record
                ps_label = first_pos_of_ps.setdefault(first_call['PS'], rec.POS)
            else:
                ps_label = 1

        record = Record()
        record.copy_from_rec(rec, ps_label, next_idx)
        next_idx += 1
        self.chromo_record[record.pos] = record

        if record.ps != 0:
            ps_key = record.ps
            self.chromo_record2phaseset_map[record.pos] = ps_key
            if ps_key not in self.chromo_phase_set:
                self.chromo_phase_set[ps_key] = PhaseSet(record.ps)
            self.chromo_phase_set[ps_key].insert_record(record)
class IlluminaWriter(object):
    '''It writes the SNPs in Illumina format

    For every SNV one tab-separated line is written: CHROM, POS, ID and a
    sequence made of the reference segment before the SNV, the SNV section
    and the reference segment after it.

    ref_fpath should be in fasta format.
    If a vcf is given, the SNVs found in the adjacent segments are replaced
    by IUPAC codes.
    '''
    # TODO add extra error classes
    # TODO include the error classes inside this class to easy access
    class NotEnoughAdjacentSequenceError(Exception):
        '''Raised when a flanking segment is shorter than the minimum length'''
        pass

    def __init__(self, ref_fpath, out_fhand, length=60, vcf_fpath=None,
                 min_length=None):
        '''It inits.

        ref_fpath: path to the reference fasta, indexed with SeqIO.index.
        out_fhand: writable file object; the header line is written here
            immediately.
        length: desired length of each adjacent segment.
        vcf_fpath: optional vcf used to replace in the reference sequence
            the SNPs around the SNP of interest with IUPAC codes.
        min_length: minimum acceptable length of an adjacent segment;
            defaults to length.

        It raises ValueError if min_length is bigger than length.
        '''
        self._sep = u'\t'
        self._len = length
        if min_length is None:
            min_length = length
        if min_length > length:
            msg = 'Minimum length must be smaller than required length'
            raise ValueError(msg)
        self._min_len = min_length
        self._ref_seqs = SeqIO.index(ref_fpath, format='fasta')
        if vcf_fpath:
            self._snvs = Reader(filename=vcf_fpath)
        else:
            self._snvs = None
        self._out_fhand = out_fhand
        out_fhand.write(u'CHROM\tPOS\tID\tseq\n')
        self._prev_chrom = None

    def write(self, snv):
        '''It writes one line for the given snv.

        It raises NotEnoughAdjacentSequenceError if the reference segment
        before or after the snv is shorter than the minimum length.
        '''
        chrom_name = snv.CHROM
        chrom = self._prev_chrom
        if chrom is None or chrom.name != chrom_name:
            # the last chromosome record is kept to avoid repeated lookups
            chrom = self._ref_seqs[chrom_name]
            self._prev_chrom = chrom

        length = self._len
        min_len = self._min_len
        snv_start = snv.start  # 0 based
        snv_end = snv.end  # 1 based
        # Clamp at 0: a negative start would make the slice count from the
        # end of the chromosome and return the wrong (usually empty) segment
        # for SNVs closer than `length` to the chromosome start.
        desired_start = max(0, snv_start - length)  # desired segment start
        end = snv_end + length  # desired segment end
        chrom_seq = chrom.seq

        first_segment = unicode(chrom_seq[desired_start:snv_start])
        if len(first_segment) < min_len:
            # NOTE(review): this is the segment before the SNV but the
            # message says 3' (and the one below says 5'); labels look
            # swapped -- confirm before changing the text.
            msg = "Not enough sequence in 3'. ID: %s, POS: %d, CHROM: %s"
            msg %= (snv.ID, snv.POS, snv.CHROM)
            raise self.NotEnoughAdjacentSequenceError(msg)
        if self._snvs:
            real_start = snv_start - len(first_segment)
            close_snvs = self._snvs.fetch(chrom.name, start=real_start,
                                          end=snv_start)
            first_segment = _replace_snvs_with_iupac(first_segment,
                                                     close_snvs,
                                                     seq_offset=real_start)

        snv_segment = _build_snv_section(snv)

        # slicing past the end of the sequence just yields a shorter segment
        second_segment = unicode(chrom_seq[snv_end:end])
        if len(second_segment) < min_len:
            msg = "Not enough sequence in 5'. ID: %s, POS: %d, CHROM: %s"
            msg %= (snv.ID, snv.POS, snv.CHROM)
            raise self.NotEnoughAdjacentSequenceError(msg)
        if self._snvs:
            real_end = snv_end + len(second_segment)
            close_snvs = self._snvs.fetch(chrom.name, start=snv_end,
                                          end=real_end)
            second_segment = _replace_snvs_with_iupac(second_segment,
                                                      close_snvs,
                                                      seq_offset=snv_end)

        snp_id = snv.ID
        if snp_id is None:
            snp_id = u'.'

        out_fhand = self._out_fhand
        sep = self._sep
        pieces = (unicode(snv.CHROM), sep,
                  unicode(snv.POS), sep,
                  snp_id, sep,
                  first_segment, snv_segment, second_segment,
                  u'\n')
        for piece in pieces:
            out_fhand.write(piece)

    def flush(self):
        '''It flushes the output file'''
        self._out_fhand.flush()

    def close(self):
        '''It closes the output file'''
        self._out_fhand.close()