def test_illumina_writer(self):
    '''Check IlluminaWriter output with and without IUPAC-coded neighbours.

    A small bgzipped, tabix-indexed VCF and a one-sequence fasta reference
    are written to temporary files.  Two writers are exercised:

    * one that receives the VCF, so SNVs flanking the SNV of interest are
      rewritten by the writer (checked against the full expected output);
    * one without a VCF (checked only for the header and the start of the
      first record, since the flanks differ from the first writer).
    '''
    # Fixture: nine variants on the 9 bp reference "ref"; the literal uses
    # single spaces that are turned into tabs just below.
    vcf = '''##fileformat=VCFv4.1
#CHROM POS ID REF ALT QUAL FILTER INFO
ref 1 . C T 10 PASS .
ref 2 . CT CA,C 10 PASS .
ref 3 . T A 10 PASS .
ref 4 . G C 10 PASS .
ref 5 . A G 10 PASS .
ref 6 . T A,C 10 PASS .
ref 7 . TT T 10 PASS .
ref 8 . T TA 10 PASS .
ref 10 . A C 10 PASS .
'''
    vcf = vcf.replace(' ', '\t')
    vcf_fhand = NamedTemporaryFile(suffix='.vcf')
    vcf_fhand.write(vcf)
    vcf_fhand.flush()
    vcf_compressed = NamedTemporaryFile(suffix='.vcf.gz')
    compress_with_bgzip(vcf_fhand, vcf_compressed)
    index_vcf_with_tabix(vcf_compressed.name)

    ref_fhand = NamedTemporaryFile(suffix='.fasta')
    ref_fhand.write('>ref\nACTGATTTA\n')
    ref_fhand.flush()

    # Writer that knows about the neighbouring SNVs.
    out_fhand1 = StringIO()
    writer = IlluminaWriter(ref_fhand.name, out_fhand1, min_length=0,
                            vcf_fpath=vcf_compressed.name)
    for snp in Reader(filename=vcf_compressed.name):
        writer.write(snp)

    # With no SNPs converted to IUPAC around
    out_fhand2 = StringIO()
    writer = IlluminaWriter(ref_fhand.name, out_fhand2, min_length=0)
    for snp in Reader(filename=vcf_compressed.name):
        writer.write(snp)

    # The tabix index is not a temporary file object, remove it by hand.
    remove(vcf_compressed.name + '.tbi')

    expected = u'CHROM\tPOS\tID\tseq\n'
    expected += u'ref\t1\t.\t[C/T]*WSRHT-^A\n'
    expected += u'ref\t2\t.\tY[CT/CA/C]SRHT-^A\n'
    expected += u'ref\t3\t.\tYC[T/A]SRHT-^A\n'
    expected += u'ref\t4\t.\tY*W[G/C]RHT-^A\n'
    expected += u'ref\t5\t.\tY*WS[A/G]HT-^A\n'
    expected += u'ref\t6\t.\tY*WSR[T/A/C]T-^A\n'
    expected += u'ref\t7\t.\tY*WSRH[TT/T]A\n'
    expected += u'ref\t8\t.\tY*WSRHT[T/TA]A\n'
    expected += u'ref\t10\t.\tY*WSRHT-^A[A/C]\n'
    assert expected == out_fhand1.getvalue()

    # The VCF-less writer was previously never inspected: the prefix check
    # below used to be run against out_fhand1 again, where it is implied by
    # the equality above.
    expected = u'CHROM\tPOS\tID\tseq\nref\t1\t.\t[C/T]'
    assert expected in out_fhand2.getvalue()
def assertVcfHasVariantWithCall(self, vcf, chrom, pos, sample, call):
    """
    Assert that `sample` carries `call` at `chrom`:`pos` in the file `vcf`.

    `call` is a dict of entries of the VCF sample field; every one of its
    items has to be present in the sample's call data for the assertion to
    hold.  The sample itself must exist in the file.

    Example:

        self.assertVcfHasVariantWithCall(my_vcf, 1, 3184885, 'B',
                                         call={'GT': '1/2', 'DP': 10})

    Raises AssertionError when no matching call is found.
    """
    self.assertVcfHasSample(vcf, sample)

    reader = Reader(filename=vcf)
    matched = False
    for record in reader.fetch(chrom=chrom, start=pos - 1, end=pos):
        if record.CHROM != str(chrom) or record.POS != pos:
            continue
        for sample_call in record.samples:
            if sample_call.sample != sample:
                continue
            observed = set(sample_call.data.__dict__.items())
            wanted = set(call.items())
            # All requested items must be among the observed ones.
            if wanted <= observed:
                matched = True

    if matched:
        return
    raise AssertionError(
        "Call {} not present for sample {} at {}:{} in {}".format(
            call, sample, chrom, pos, vcf))
def assertVcfHasVariantAt(self, vcf, chrom, pos):
    """Assert that the file `vcf` holds a record at `chrom`:`pos`.

    Raises AssertionError when no fetched record has that chromosome and
    position.
    """
    reader = Reader(filename=vcf)
    hits = 0
    for record in reader.fetch(chrom=chrom, start=pos - 1, end=pos):
        if record.CHROM == str(chrom) and record.POS == pos:
            hits += 1

    if hits:
        return
    raise AssertionError("Variant at {}:{} not present in {}".format(
        chrom, pos, vcf))
def parse_vcf(assembly, vcf_infile, compressed=True, verbose=True, by_id=True, **tabix_params):
    """Yield one document per parsed VCF record (or per id of a record).

    :param assembly: passed to ``parse_one_rec``; also the key of the
        per-allele value that is split when a record carries several ids.
    :param vcf_infile: path of the VCF file.
    :param compressed: forwarded to ``Reader``; files ending in ``.gz`` are
        always opened as compressed.
    :param verbose: log the rsid and ``_id`` of every yielded document.
    :param by_id: when true, a record whose ``_id`` is a list is expanded
        into one document per id; otherwise the document is yielded as is.
    :param tabix_params: when given, restricts iteration to
        ``vcf_r._tabix.fetch(**tabix_params)``.

    Records whose ``_id`` is falsy are skipped and counted.  A summary is
    logged once the input is exhausted.
    """
    t0 = time.time()
    # The original statement here was `compressed == vcf_infile.endswith('.gz')`,
    # a comparison whose result was thrown away.  Detect gzip by extension
    # while still honouring an explicit compressed=True from the caller.
    compressed = compressed or vcf_infile.endswith('.gz')
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)  # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)

    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    for rec in vcf_r:
        doc = parse_one_rec(assembly, rec)
        if not doc['_id']:
            # nothing usable for this record
            cnt_3 += 1
        elif by_id and isinstance(doc['_id'], list):
            # one hgvs id, one doc
            for i, _id in enumerate(doc['_id']):
                _doc = copy.copy(doc)
                _doc['alt'] = doc['alt'][i]
                _doc[assembly] = doc[assembly][i]
                _doc['_id'] = _id
                yield _doc
                cnt_2 += 1
                if verbose:
                    logging.info("%s\t%s", _doc['rsid'], _doc['_id'])
        else:
            # single id, or one rsid per doc
            yield doc
            cnt_2 += 1
            if verbose:
                logging.info("%s\t%s", doc['rsid'], doc['_id'])
        cnt_1 += 1

    logging.info("Done. [{}]".format(timesofar(t0)))
    logging.info("Total rs: {}; total docs: {}; skipped rs: {}".format(
        cnt_1, cnt_2, cnt_3))
def assertVcfHasVariantWithChromPosRefAlt(self, vcf, chrom, pos, ref, alt):
    """Assert that `vcf` has a record at `chrom`:`pos` with `ref` and `alt`.

    `alt` only needs to be one of the record's ALT entries.  Raises
    AssertionError when no fetched record matches.
    """
    reader = Reader(filename=vcf)
    matched = False
    for record in reader.fetch(chrom=chrom, start=pos - 1, end=pos):
        if record.CHROM != str(chrom):
            continue
        if record.POS != pos:
            continue
        if record.REF != ref:
            continue
        if alt in record.ALT:
            matched = True

    if matched:
        return
    raise AssertionError(
        "Variant at {}:{} {}/{} not present in {}".format(
            chrom, pos, ref, alt, vcf))
def __init__(self, filename, convert_empty=True, **kwargs):
    """Set up the underlying Reader and the DataFrame view of the file.

    Parameters
    ----------
    filename : path of the VCF file; used both for the Reader and for
        loading the DataFrame
    convert_empty : when true, empty VCF values are converted to NaN
    **kwargs : forwarded unchanged to ``Reader.__init__``
    """
    Reader.__init__(self, filename=filename, **kwargs)
    self._init_df(filename)

    if not convert_empty:
        self.is_nan_converted = False
        return

    self._convert_empty_to_nan()
    self.is_nan_converted = True
def __init__(self, ref_fpath, out_fhand, length=60, vcf_fpath=None,
             min_length=None):
    '''Prepare the writer and emit the header line.

    ref_fpath is indexed as fasta.  When vcf_fpath is given, a Reader on
    it is kept so the SNPs around the SNP of interest can be replaced in
    the reference sequence with IUPAC codes.  min_length defaults to
    length; a min_length above length raises ValueError.
    '''
    self._sep = u'\t'
    self._len = length

    effective_min = length if min_length is None else min_length
    if effective_min > length:
        raise ValueError('Minimum length must be smaller than required length')
    self._min_len = effective_min

    self._ref_seqs = SeqIO.index(ref_fpath, format='fasta')
    self._snvs = Reader(filename=vcf_fpath) if vcf_fpath else None

    self._out_fhand = out_fhand
    out_fhand.write(u'CHROM\tPOS\tID\tseq\n')
    self._prev_chrom = None
def test_vcf2sfs(self):
    """Compare vcf2sfs on the fixture region against the stored result.

    The panel is restricted to the rows whose 'pop' column is 'YRI' before
    the call; each of the three returned components is checked in turn.
    """
    reader = Reader(filename=self.filename, compressed=True,
                    encoding='utf-8')
    full_panel = read_csv(self.panelname, sep=None, engine='python',
                          skipinitialspace=True, index_col=0)
    yri_panel = full_panel[full_panel['pop'] == 'YRI']

    result = vcf2sfs(reader, yri_panel, self.chrom, self.start, self.end,
                     select_chr=True)

    checks = (
        (0, "Failed test of vcf2sfs (sfs)"),
        (1, "Failed test of vcf2sfs (sample size)"),
        (2, "Failed test of vcf2sfs (common variant)"),
    )
    for position, message in checks:
        assert all(self.result[position] == result[position]), message
def get_haplotype_stats(template_vcf: vcf.Reader, in_vcf: vcf.Reader, out):
    """Accumulate per-contig haplotype statistics and write the total line.

    For every contig listed in ``in_vcf.contigs`` the template and input
    haplotypes are built, compared with ``get_haplotype_stats_chromo``
    (which also receives ``out``), and merged into one ``HapStats``.

    :param template_vcf: reader on the reference (template) phasing.
    :param in_vcf: reader on the phasing being evaluated.
    :param out: writable text handle; receives the final "total" row.

    A contig that fails at any step is skipped (best effort).
    """
    hap_stats = HapStats()
    for contig in in_vcf.contigs:
        try:
            # Raises when the template has no data for this contig.
            template_vcf.fetch(contig)
            template_chromo = ChromosomoHaplotype(template_vcf, contig)
            in_chromo = ChromosomoHaplotype(in_vcf, contig)
            chromo_hap_stats = get_haplotype_stats_chromo(
                template_chromo, in_chromo, out, contig)
            hap_stats.insert_hap_stats(chromo_hap_stats)
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit.  Ordinary errors still just skip the contig.
            continue

    out.write("%s\t%d\t%d\t%d\t%d\t%.8f\t%.8f\n" %
              ("total", hap_stats.get_AN50(), hap_stats.get_N50(),
               hap_stats.get_total_phased(), hap_stats.get_total_spanned(),
               hap_stats.get_switch_error(), hap_stats.get_mismatch_error()))
def test_calculate_statistics(self):
    """Compare each fixture VCF against itself and check the statistics."""
    # with freebayes
    freebayes_reader = Reader(filename=FREEBAYES_VCF_PATH)
    freebayes_cmp = VCFcomparisons(FREEBAYES_VCF_PATH)
    stats = freebayes_cmp.calculate_statistics(freebayes_reader)
    for key, value in (('common', 944), ('uncalled', 0),
                       ('different', 0), ('common_snps_prc', 100)):
        assert stats[key] == value

    # with varscan
    varscan_reader = Reader(filename=VARSCAN_VCF_PATH)
    varscan_cmp = VCFcomparisons(VARSCAN_VCF_PATH, samples=['mu16'])
    stats = varscan_cmp.calculate_statistics(varscan_reader,
                                             samples=['mu16'])
    for key, value in (('common', 107), ('uncalled', 69),
                       ('different', 0), ('common_snps_prc', 100)):
        assert stats[key] == value
def parse_vcf(vcf_infile, compressed=True, verbose=True, by_id=True, **tabix_params):
    """Yield one document per parsed VCF record (or per id of a record).

    :param vcf_infile: path of the VCF file.
    :param compressed: forwarded to ``Reader``; files ending in ``.gz`` are
        always opened as compressed.
    :param verbose: print the rsid and ``_id`` of every yielded document.
    :param by_id: when true, a record whose ``_id`` is a list is expanded
        into one document per id (splitting ``alt`` and ``POS_KEY``
        index-wise); otherwise the document is yielded as is.
    :param tabix_params: when given, restricts iteration to
        ``vcf_r._tabix.fetch(**tabix_params)``.

    Records whose ``_id`` is falsy are skipped and counted.  A summary is
    printed once the input is exhausted.
    """
    t0 = time.time()
    # The original statement here was `compressed == vcf_infile.endswith('.gz')`,
    # a comparison whose result was thrown away.  Detect gzip by extension
    # while still honouring an explicit compressed=True from the caller.
    compressed = compressed or vcf_infile.endswith('.gz')
    vcf_r = Reader(filename=vcf_infile, compressed=compressed)
    vcf_r.fetch('1', 1)  # call a dummy fetch to initialize vcf_r._tabix
    if tabix_params:
        vcf_r.reader = vcf_r._tabix.fetch(**tabix_params)

    cnt_1, cnt_2, cnt_3 = 0, 0, 0
    for rec in vcf_r:
        doc = parse_one_rec(rec)
        if not doc['_id']:
            # nothing usable for this record
            cnt_3 += 1
        elif by_id and isinstance(doc['_id'], list):
            # one hgvs id, one doc
            for i, _id in enumerate(doc['_id']):
                _doc = copy.copy(doc)
                _doc['alt'] = doc['alt'][i]
                _doc[POS_KEY] = doc[POS_KEY][i]
                _doc['_id'] = _id
                yield _doc
                cnt_2 += 1
                if verbose:
                    print(_doc['rsid'], '\t', _doc['_id'])
        else:
            # single id, or one rsid per doc
            yield doc
            cnt_2 += 1
            if verbose:
                print(doc['rsid'], '\t', doc['_id'])
        cnt_1 += 1

    print("Done. [{}]".format(timesofar(t0)))
    print("Total rs: {}; total docs: {}; skipped rs: {}".format(cnt_1, cnt_2, cnt_3))
def write_chromosome(in_vcf: vcf.Reader, out_vcf: vcf.Writer, chromo_haplotype: ChromosomoHaplotype, contig: str):
    """Copy every record of `contig` from `in_vcf` to `out_vcf`.

    Records whose first sample has ``gt_type == 1`` (heterozygous) are first
    passed to ``finalize_record`` of the entry stored under their position
    in ``chromo_haplotype.chromo_record``; all other records are written
    untouched.
    """
    for vcf_rec in in_vcf.fetch(contig):
        is_het = vcf_rec.samples[0].gt_type == 1
        if is_het:
            chromo_haplotype.chromo_record[vcf_rec.POS].finalize_record(vcf_rec)
        out_vcf.write_record(vcf_rec)
def test_errors(self):
    '''Writing with the default lengths on a 9 bp reference must fail.

    The writer is built without min_length, so the first SNV cannot get
    enough adjacent sequence and NotEnoughAdjacentSequenceError is
    expected.
    '''
    # Fixture: nine variants on the 9 bp reference "ref"; the literal uses
    # single spaces that are turned into tabs just below.
    vcf = '''##fileformat=VCFv4.1
#CHROM POS ID REF ALT QUAL FILTER INFO
ref 1 . C T 10 PASS .
ref 2 . CT CA,C 10 PASS .
ref 3 . T A 10 PASS .
ref 4 . G C 10 PASS .
ref 5 . A G 10 PASS .
ref 6 . T A,C 10 PASS .
ref 7 . TT T 10 PASS .
ref 8 . T TA 10 PASS .
ref 10 . A C 10 PASS .
'''
    vcf = vcf.replace(' ', '\t')

    plain_vcf = NamedTemporaryFile(suffix='.vcf')
    plain_vcf.write(vcf)
    plain_vcf.flush()
    vcf_compressed = NamedTemporaryFile(suffix='.vcf.gz')
    compress_with_bgzip(plain_vcf, vcf_compressed)
    index_vcf_with_tabix(vcf_compressed.name)

    reference = NamedTemporaryFile(suffix='.fasta')
    reference.write('>ref\nACTGATTTA\n')
    reference.flush()

    sink = StringIO()
    writer = IlluminaWriter(reference.name, sink,
                            vcf_fpath=vcf_compressed.name)
    first_snp = Reader(filename=vcf_compressed.name).next()

    try:
        writer.write(first_snp)
    except IlluminaWriter.NotEnoughAdjacentSequenceError:
        pass
    else:
        self.fail('NotEnoughAdjacentSequenceError expected')
def vcf_to_hgvs(build_name, input_handle, output_handle):
    """
    Write the HGVS description of every variant in a VCF file.

    One line is written per ALT entry of every record.

    :arg str build_name: Build name.
    :arg stream input_handle: Open readable handle to a VCF file.
    :arg stream output_handle: Open writeable handle to a text file.
    """
    converter = Mutalyzer(build_name)
    for vcf_record in Reader(input_handle):
        for alternative in vcf_record.ALT:
            description = converter.vcf_to_hgvs(
                vcf_record.CHROM, vcf_record.POS, vcf_record.REF,
                alternative)
            output_handle.write('{}\n'.format(description))
def __init__(self):
    """Set up the fixture paths, reader, panel, region and expected values."""
    # Input files, both located under `projdir`.
    self.filename = projdir + '/chr1vcftest.gz'
    self.panelname = projdir + '/testpanel.panel'
    self.vcf_file = Reader(filename=projdir + '/chr1vcftest.gz', compressed=True, encoding='utf-8')
    # sep=None together with engine='python' lets pandas sniff the delimiter.
    self.panel = read_csv(projdir + '/testpanel.panel', sep=None, engine='python', skipinitialspace=True, index_col=0)
    # Region used by the tests.
    self.chrom = '1'
    self.start = 159173097
    self.end = 159176290
    # Expected triple compared index by index in test_vcf2sfs, whose
    # messages label the parts "sfs", "sample size" and "common variant".
    self.result = ((array([3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                           0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 108, ['rs2814778']))
    # NOTE(review): presumably the expected output of the neutrality test
    # for this region - confirm against the test that reads it.
    self.rho = 0.9864313429651763
def vcf_to_db(build_name, input_handle, output_handle):
    """
    Write every variant in a VCF file in database format.

    One entry is written per ALT entry of every record.

    :arg str build_name: Build name.
    :arg stream input_handle: Open readable handle to a VCF file.
    :arg stream output_handle: Open writeable handle to a text file.
    """
    converter = Mutalyzer(build_name)
    for vcf_record in Reader(input_handle):
        for alternative in vcf_record.ALT:
            db_fields = converter.vcf_to_db(
                vcf_record.CHROM, vcf_record.POS, vcf_record.REF,
                alternative)
            _write_db(output_handle, *db_fields)
def _extract(cls, file_path: Union[str, Path]) -> EvaluationData:
    """Read a VCF file into an ``EvaluationData`` object.

    Every record yields one ``EvaluationDataEntry`` built from CHROM, POS,
    REF, the ALT sequence, the lower-cased first ``CLNSIG`` INFO value, the
    record's ``var_type`` and the reference genome resolved from the
    ``reference`` header entry.

    :param file_path: path of the VCF file.
    :return: ``EvaluationData.from_records`` of all entries.
    """
    records = []
    # The handle used to be opened inline and never closed.
    with open(file_path, "r") as vcf_handle:
        vcf_reader = Reader(vcf_handle)
        for vcf_record in vcf_reader:
            chrom = str(vcf_record.CHROM)
            pos = vcf_record.POS
            ref = vcf_record.REF
            # NOTE(review): with more than one ALT this takes `.sequence`
            # of the ALT list itself, which looks like it would raise -
            # confirm that inputs are always single-ALT.
            alt = (vcf_record.ALT[0] if len(vcf_record.ALT) == 1 else
                   vcf_record.ALT).sequence
            clnsig = PathogencityClass(vcf_record.INFO["CLNSIG"][0].lower())
            variation_type = VariationType(vcf_record.var_type)
            rg = ReferenceGenome.resolve(vcf_reader.metadata["reference"])
            records.append(
                EvaluationDataEntry(chrom, pos, ref, alt, clnsig,
                                    variation_type, rg))
    return EvaluationData.from_records(records)
def main():
    """Set uncalled sample groups to homozygous reference in a VCF.

    Command line: ``<infile|-> <outfile|-> <size1,size2,...>``.  ``-`` means
    stdin / stdout.  The comma-separated sizes partition the samples of
    each record into consecutive groups; when none of the calls in a group
    is called, every call of that group gets GT "0/0" (DP, GQ and PL are
    carried over).  All records are written to the output.
    """
    infile = sys.argv[1]
    outfile = sys.argv[2]
    group_sizes = tuple(int(size) for size in sys.argv[3].split(","))
    group_cnt = range(len(group_sizes))
    group_ixs = (0,) + tuple(cumsum(group_sizes))

    # "r" instead of the former "rU": the "U" flag is rejected by
    # Python 3.11+, and universal newlines are the text-mode default on
    # Python 3.
    in_handle = sys.stdin if infile == "-" else open(infile, "r")
    try:
        out_handle = sys.stdout if outfile == "-" else open(outfile, "w")
        try:
            reader = Reader(in_handle)
            writer = Writer(out_handle, reader)
            for rec in reader:
                fix = []
                # The loop index used to be called `i`, clobbering the input
                # handle of the same name, so the final close() failed.
                for group in group_cnt:
                    calls = rec.samples[group_ixs[group]:group_ixs[group + 1]]
                    if not any(c.called for c in calls):
                        fix.extend(calls)
                for c in fix:
                    # This is a hack because PyVCF _Call objects are not mutable
                    c.data = vcf.model.make_calldata_tuple(c.data._fields)(
                        GT="0/0", DP=c.data.DP, GQ=c.data.GQ, PL=c.data.PL)
                writer.write_record(rec)
        finally:
            # The old check read `cloes_outfile`, which raised NameError.
            if out_handle is not sys.stdout:
                out_handle.close()
    finally:
        if in_handle is not sys.stdin:
            in_handle.close()
def BisSNP_vcf2bed(INfile, OUTfile, mextDir, my_session, logobject):
    """Flatten a single-sample VCF into a tab-separated table.

    One output row per record, with the columns listed in ``colN``; the
    table is written to ``OUTfile`` with 'NA' for missing values.

    :param INfile: path of the plain-text input VCF.
    :param OUTfile: path of the tab-separated output file.
    :param mextDir: unused here.
    :param my_session: unused here.
    :param logobject: logger; receives one info message on completion.
    """
    # Count the lines that contain no '#'.  This mirrors the former
    # `grep -v "#" <file> | wc -l`, without building a shell command from
    # the file name.
    with open(INfile, 'r') as vcf_handle:
        rowNum = sum(1 for line in vcf_handle if '#' not in line)

    colN = [
        'CHROM', 'POS', 'STRAND', 'REF', 'QUAL', 'MQ0', 'DP', 'DP_FREF',
        'DP_RREF', 'DP_FALT', 'DP_RALT', 'C_Cstrand', 'T_Cstrand',
        'Illegal_Cstrand', 'G_Gstrand', 'A_Gstrand', 'Illegal_Gstrand'
    ]
    # create empty frame of the desired size
    dummyA = np.empty(shape=(rowNum, 17), dtype='object')
    CpG_df = pd.DataFrame(dummyA, index=range(0, rowNum), columns=colN)

    # read in vcf; the handle is now closed when the loop is done
    with open(INfile, 'r') as vcf_handle:
        vcf_reader = Reader(vcf_handle)
        for record, i in zip(vcf_reader, range(0, rowNum)):
            CHROM = pd.Series(record.CHROM, name='CHROM')
            POS = pd.Series(record.POS, name='POS')
            # skip ALT for CpG calls; skip FILTER
            STRAND = pd.Series(record.INFO['CS'], name='STRAND')
            REF = pd.Series(record.REF, name='REF')
            QUAL = pd.Series(record.QUAL, name='QUAL')
            MQ0 = pd.Series(record.INFO['MQ0'], name='MQ0')
            # The sample data is unpacked positionally into these 11 fields.
            GT, BQ, BRC6, CM, CP, CU, DP, DP4, GP, GQ, SS = record.samples[0].data
            # skip GT as non-homozygous CGs have been filtered out
            DP = pd.Series(DP, name='DP')
            DP4_dict = cll.OrderedDict(
                zip(['DP_FREF', 'DP_RREF', 'DP_FALT', 'DP_RALT'], DP4))
            DP4_df = pd.DataFrame.from_dict(DP4_dict,
                                            orient='index').transpose()
            BRC6_dict = cll.OrderedDict(
                zip([
                    'C_Cstrand', 'T_Cstrand', 'Illegal_Cstrand', 'G_Gstrand',
                    'A_Gstrand', 'Illegal_Gstrand'
                ], BRC6))
            BRC6_df = pd.DataFrame.from_dict(BRC6_dict,
                                             orient='index').transpose()
            comb_df = pd.concat(
                [CHROM, POS, STRAND, REF, QUAL, MQ0, DP, DP4_df, BRC6_df],
                axis=1,
                join='inner')
            CpG_df.iloc[i, ] = comb_df.iloc[0, ]

    CpG_df.to_csv(OUTfile, sep='\t', na_rep='NA', index=False)
    logobject.info('CpG vcf to txt conversion complete')
    return
def test_neutrality_from_vcf(vcf_name, panel_name, coord, start, end, sel, reps, select_chr):
    """Calculate the log odds ratio of the data specified by PyVCF file VCF_NAME, sample details PANEL_NAME and region defined by CHROM, START and END."""
    reader = Reader(filename=vcf_name, compressed=True, encoding='utf-8')
    panel = pd.read_csv(panel_name, sep=None, engine='python',
                        skipinitialspace=True, index_col=0)

    # sel is a (column, value) pair; (None, None) means "keep every row".
    if sel != (None, None):
        column = sel[0]
        wanted = sel[1]
        panel = panel[panel[column] == wanted]

    sfs_parts = selectiontest.vcf2sfs(reader, panel, coord, start, end,
                                      select_chr)
    # Only the spectrum is used below; the other two parts are ignored.
    sfs, n, non_seg_snps = sfs_parts
    rho = selectiontest.test_neutrality(sfs, variates0=None,
                                        variates1=None, reps=reps)
    click.echo(rho)
def _grid_export_vcf(filename, genome_build, colmodels, items, sample_ids, sample_names_by_id):
    """Build a streaming HTTP response that serves `items` as a VCF file.

    A header template derived from the column models is parsed to set up
    the VCF writer.  Each item is turned into a record, written to a stash
    buffer, and the buffer's value is streamed after every record.  The
    response is offered as the attachment "<filename>.vcf".
    """
    samples = []
    for s_id in sample_ids:
        samples.append(sample_names_by_id[s_id])

    info_dict = _get_colmodel_info_dict(colmodels)
    header_template = _colmodels_to_vcf_header(genome_build, info_dict,
                                               samples)
    header_reader = Reader(header_template, strict_whitespace=True)

    stash = StashFile()
    record_writer = Writer(stash, header_reader)

    def iter_row_writer():
        # One chunk is yielded per written record.
        for obj in items:
            vcf_record = _grid_item_to_vcf_record(info_dict, obj,
                                                  sample_ids, samples)
            record_writer.write_record(vcf_record)
            yield stash.value

    response = StreamingHttpResponse(iter_row_writer(),
                                     content_type="text/csv")
    response['Content-Disposition'] = f'attachment; filename="{filename}.vcf"'
    return response
def parse(json_filename, vcf_file):
    """Export the records with AF > 0.01 of a VCF file as JSON, per chromosome.

    :param json_filename: output path; its second "/"-separated component
        also names the summary file written under ``Output/``.
    :param vcf_file: path of the input VCF.

    Side effects: truncates/creates ``errors.txt``, writes the JSON file
    and writes ``Output/data_split_<name>.txt`` with the kept and total
    record counts.  Records without an ``AF`` INFO entry (or whose
    conversion raises ``KeyError``) are skipped.
    """
    amount = 0
    # Stays 0 for an empty VCF; this used to be computed from the loop
    # variable after the loop, which failed when there were no records.
    total_data = 0

    # Start from an empty errors file.
    with open("errors.txt", "w"):
        pass

    json_chr = {}
    # The VCF handle used to be opened inline and never closed.
    with open(vcf_file) as vcf_handle:
        for index, record in enumerate(Reader(vcf_handle)):
            total_data = index + 1
            try:
                if record.INFO["AF"][0] > 0.01:
                    amount += 1
                    # Convert first so a failing record leaves no empty
                    # chromosome entry behind.
                    entry = to_json(record, index)
                    json_chr.setdefault(str(record.CHROM), []).append(entry)
            # Just skip errors.
            except KeyError:
                pass

    # Write the file used for MongoDB
    with open(json_filename, "w") as outjson:
        json.dump(json_chr, outjson)

    # Extra information about how much data is stored.
    with open("Output/data_split_{}.txt".format(json_filename.split("/")[1]),
              "w") as dp:
        dp.write("Amount of data: {}\nTotal data in file: {}".format(
            amount, total_data))
def __init__(self, in_vcf: vcf.Reader, chromo: str):
    """Collect the heterozygous records of one chromosome and their phase sets.

    Iterates ``in_vcf.fetch(chromo)`` and fills:

    * ``chromo_record``: record position -> ``Record``
    * ``chromo_phase_set``: phase-set id -> ``PhaseSet``
    * ``chromo_record2phaseset_map``: record position -> phase-set id

    Only the first sample of each VCF record is looked at.
    """
    self.chromo_record = dict()
    self.chromo_phase_set = dict()
    self.chromo_record2phaseset_map = dict()
    self.graph_struct = graph.Graph()
    rec: vcf.model._Record
    # Maps an original PS label to the position of the first record seen
    # with that label; that position is used as the phase-set id.
    ps_label_fix = dict()
    idx = 0
    for rec in in_vcf.fetch(chromo):
        het = rec.samples[0].gt_type
        if het != 1:  # not het loci
            continue
        # 0 stays in place for unphased records.
        PS_fix = 0
        if rec.samples[0].phased:
            fmt = rec.FORMAT.split(':')
            if 'PS' in fmt:
                PS = rec.samples[0]['PS']
                if PS in ps_label_fix.keys():
                    PS_fix = ps_label_fix[PS]
                else:
                    ps_label_fix[PS] = rec.POS
                    PS_fix = rec.POS
            else:
                # phased, but no PS field: all such records share id 1
                PS_fix = 1
        record = Record()
        record.copy_from_rec(rec, PS_fix, idx)
        idx += 1
        self.chromo_record[record.pos] = record
        if record.ps != 0:
            PS = record.ps
            self.chromo_record2phaseset_map[record.pos] = PS
            phase_set: PhaseSet
            # Reuse the phase set for this id, or create it on first use.
            if PS in self.chromo_phase_set.keys():
                phase_set = self.chromo_phase_set[PS]
            else:
                phase_set = PhaseSet(record.ps)
                self.chromo_phase_set[PS] = phase_set
            phase_set.insert_record(record)
import sys
from vcf import Reader
import gzip

# Count the records of the VCF given as first argument that have a single
# ALT allele, a QUAL of at least 20 and a first alternate allele frequency
# of at most 0.05, then print the count.
n = 0
# The handle used to be opened inline and never closed.
with open(sys.argv[1], 'r') as vcf_handle:
    vcf = Reader(vcf_handle)
    for v in vcf:
        if len(v.ALT) > 1:
            continue
        # A missing QUAL is treated as failing the filter; comparing None
        # with a number raises TypeError on Python 3.
        if v.QUAL is None or v.QUAL < 20:
            continue
        if v.aaf[0] > 0.05:
            continue
        n += 1
print(n)
def overwrite_reader_samples(vcf_reader: vcf.Reader, samples):
    """Replace the sample list of `vcf_reader` and rebuild its name index.

    Sets ``vcf_reader.samples`` to `samples` and ``_sample_indexes`` to a
    dict mapping each sample name to its position in that list (for a
    repeated name the last position wins).
    """
    vcf_reader.samples = samples
    vcf_reader._sample_indexes = {
        name: position
        for position, name in enumerate(vcf_reader.samples)
    }
class IlluminaWriter(object):
    '''It writes the SNPs in Illumina format.

    Every written SNV becomes one tab-separated line: CHROM, POS, ID and
    the sequence made of the left flank, the SNV section and the right
    flank.

    ref_fpath should be in fasta format.  When a VCF path is given, the
    SNVs found in the flanks are reported as IUPAC codes.
    '''
    # TODO add extra error classes
    # TODO include the error classes inside this class to easy access

    class NotEnoughAdjacentSequenceError(Exception):
        '''A flank is shorter than the minimum length.'''
        pass

    def __init__(self, ref_fpath, out_fhand, length=60, vcf_fpath=None,
                 min_length=None):
        '''It inits.

        ref_fpath: path of the fasta reference.
        out_fhand: writable (unicode) handle; the header is written now.
        length: desired length of each flank.
        vcf_fpath: optional VCF path.  The vcf will be used to replace in
            the reference sequence the SNPs around the SNP of interest
            with IUPAC codes.
        min_length: minimum accepted flank length, defaults to length.

        Raises ValueError when min_length is larger than length.
        '''
        self._sep = u'\t'
        self._len = length
        if min_length is None:
            min_length = length
        if min_length > length:
            msg = 'Minimum length must be smaller than required length'
            raise ValueError(msg)
        self._min_len = min_length
        self._ref_seqs = SeqIO.index(ref_fpath, format='fasta')
        if vcf_fpath:
            self._snvs = Reader(filename=vcf_fpath)
        else:
            self._snvs = None
        self._out_fhand = out_fhand
        out_fhand.write(u'CHROM\tPOS\tID\tseq\n')
        # Last reference record used, so consecutive SNVs on the same
        # chromosome do not hit the fasta index again.
        self._prev_chrom = None

    def write(self, snv):
        '''It writes one SNV with its flanking sequences.

        Raises NotEnoughAdjacentSequenceError when either flank is shorter
        than the minimum length.
        '''
        chrom_name = snv.CHROM
        prev_chrom = self._prev_chrom
        if prev_chrom is None or prev_chrom.name != chrom_name:
            chrom = self._ref_seqs[chrom_name]
            self._prev_chrom = chrom
        else:
            chrom = prev_chrom

        length = self._len
        min_len = self._min_len
        snv_start = snv.start  # 0 based
        snv_end = snv.end  # 1 based
        # Clamp at the chromosome start.  A negative start index would be
        # counted from the end of the sequence by the slice below, which
        # gave an empty (or wrong) left flank for SNVs closer than `length`
        # to the start of a long reference.
        desired_start = max(0, snv_start - length)  # desired segment start
        end = snv_end + length  # desired segment end

        chrom_seq = chrom.seq
        first_segment = unicode(chrom_seq[desired_start:snv_start])
        if len(first_segment) < min_len:
            msg = "Not enough sequence in 3'. ID: %s, POS: %d, CHROM: %s"
            msg %= (snv.ID, snv.POS, snv.CHROM)
            raise self.NotEnoughAdjacentSequenceError(msg)
        if self._snvs:
            real_start = snv_start - len(first_segment)
            close_snvs = self._snvs.fetch(chrom.name, start=real_start,
                                          end=snv_start)
            first_segment = _replace_snvs_with_iupac(first_segment,
                                                     close_snvs,
                                                     seq_offset=real_start)

        snv_segment = _build_snv_section(snv)

        second_segment = unicode(chrom_seq[snv_end:end])
        if len(second_segment) < min_len:
            msg = "Not enough sequence in 5'. ID: %s, POS: %d, CHROM: %s"
            msg %= (snv.ID, snv.POS, snv.CHROM)
            raise self.NotEnoughAdjacentSequenceError(msg)
        if self._snvs:
            real_end = snv_end + len(second_segment)
            close_snvs = self._snvs.fetch(chrom.name, start=snv_end,
                                          end=real_end)
            second_segment = _replace_snvs_with_iupac(second_segment,
                                                      close_snvs,
                                                      seq_offset=snv_end)

        out_fhand = self._out_fhand
        sep = self._sep
        out_fhand.write(unicode(snv.CHROM))
        out_fhand.write(sep)
        out_fhand.write(unicode(snv.POS))
        out_fhand.write(sep)
        snp_id = snv.ID
        if snp_id is None:
            snp_id = u'.'
        out_fhand.write(snp_id)
        out_fhand.write(sep)
        out_fhand.write(first_segment)
        out_fhand.write(snv_segment)
        out_fhand.write(second_segment)
        out_fhand.write(u'\n')

    def flush(self):
        '''It flushes the output handle.'''
        self._out_fhand.flush()

    def close(self):
        '''It closes the output handle.'''
        self._out_fhand.close()
def assertVcfHasSample(self, vcf, sample):
    """Assert that `sample` is one of the samples of the file `vcf`.

    Raises AssertionError otherwise.
    """
    known_samples = Reader(filename=vcf).samples
    if sample in known_samples:
        return
    raise AssertionError("Sample {} not present in {}".format(
        sample, vcf))
def records_from_vcf(vcf_file: str) -> list:
    """Creates a list of VCF Record objects from a VCF file.

    A record is kept when the ratio of alternate alleles to samples
    reaches ``MAF_THRESHOLD``.  Records are dropped, and counted, when
    ``RuntimeError`` is raised (missing genotypes), when ``ValueError`` is
    raised (record failed to parse) or when they fall below the threshold.
    A summary of these counts is printed.

    :param vcf_file: path of the plain-text VCF file.
    :return: list of ``VCFRecordObj``.
    """
    # Create the list of records
    records: list = []
    total: int = 0
    n_failed: int = 0
    n_with_too_many_unknown_genotypes: int = 0
    n_below_threshold: int = 0
    # Taken from the first sample of the first record that parses.
    data_set_ploidity = None

    # Load the VCF; the handle used to be opened inline and never closed.
    with open(vcf_file, 'r') as vcf_handle:
        vcf_reader: Reader = Reader(vcf_handle)
        # An explicit next() loop (rather than `for`) so iteration can go on
        # after a record that raised while being read.
        while True:
            try:
                total += 1
                r = next(vcf_reader)
                chromosome: str = r.CHROM
                position: int = r.POS
                ref_allele: str = r.REF
                alt_alleles: list = r.ALT
                if data_set_ploidity is None:
                    data_set_ploidity = r.samples[0].ploidity
                    assert data_set_ploidity == 1 or data_set_ploidity == 2
                genotypes, n_alternates = get_genotypes(
                    data_set_ploidity=data_set_ploidity,
                    r=r,
                    ref_allele=ref_allele,
                    alt_alleles=alt_alleles
                )
                assert len(genotypes) == len(r.samples)
                # Only add records if the ratio of alternate alleles to total alleles exceeds the (MAF) filter threshold
                if n_alternates / len(r.samples) >= MAF_THRESHOLD:
                    records.append(VCFRecordObj(chromosome=chromosome, position=position, genotypes=genotypes))
                else:
                    n_below_threshold += 1
            except RuntimeError:
                # There was a missing genotype in one of the samples of the current record
                n_with_too_many_unknown_genotypes += 1
                continue
            except ValueError:
                # The current record failed because the call to next above raised an exception
                n_failed += 1
                continue
            except StopIteration:
                # Decrement the total number of records since the latest increment occurred after the last iteration
                total -= 1
                break

    assert len(records) + n_with_too_many_unknown_genotypes + n_below_threshold + n_failed == total

    # Only the file name is formatted here; a second, unused argument was
    # dropped (the printed text is unchanged).
    print('For {}:'.format(vcf_file))
    print('Minor Allele Frequency Threshold: {0:.2f}'.format(MAF_THRESHOLD))
    # Print the percentages of the records that will actually be used, the records with missing genotypes, the records
    # that didn't have enough alternate alleles, and the records that flat out failed
    print_percentage(msg='records successfully added', amount=len(records), total=total)
    print_percentage(
        msg='records with too many missing genotypes',
        amount=n_with_too_many_unknown_genotypes,
        total=total
    )
    print_percentage(msg='records below the minor allele frequency threshold', amount=n_below_threshold, total=total)
    print_percentage(msg='records that failed to parse', amount=n_failed, total=total)
    print('Total number of records:', total)
    return records
#! /usr/bin/python
from vcf import Reader
from sys import argv, stderr

# Dump the records of the VCF given as first argument as a tab-separated
# table on stdout.  The annotation columns (INDEL ... ANNO) are emitted
# empty; SAMPLE holds one (name, first data field, second data field or
# None) tuple per sample.

# Output columns, in order.
COLUMNS = ('CHROM', 'POS', 'REF', 'ALT', 'SAMPLE', 'INDEL', 'HW', 'MASKED',
           'INFORM', 'REP', 'A_INDEL', 'A_SNP', 'ANNO')

path=argv[1]
reader=Reader(filename=path)
print('CHROM\tPOS\tREF\tALT\tSAMPLE\tINDEL\tHW\tMASKED\tINFORM\tREP\tA_INDEL\tA_SNP\tANNO')
calls=len(reader.samples)
# This line used to go to an undefined `output` object, which raised
# NameError before any record was printed.  It is a diagnostic, so it goes
# to stderr and the table on stdout stays clean.
stderr.write('calls:'+str(calls)+'\n')
for record in reader:
    info={'CHROM':record.CHROM,
          'POS':record.POS,
          'REF':record.REF,
          'ALT':record.ALT,
          'SAMPLE':[],
          'INDEL':'',
          'HW':'',
          'MASKED':'',
          'INFORM':'',
          'REP':'',
          'A_INDEL':'',
          'A_SNP':'',
          'ANNO':''}
    for sample in record.samples:
        second=sample.data[1] if len(sample.data)>1 else None
        info['SAMPLE'].append((sample.sample,sample.data[0],second))
    print('\t'.join(str(info[column]) for column in COLUMNS))