def get_svs(self):
    """Return the SVs of the sample VCF, keyed by SV name.

    Every record of ``self.vcf`` is wrapped in an ``SV`` object. When several
    records produce the same ``name`` the last one read is kept.
    """
    # Open the VCF in a context manager so the handle is released as soon as
    # all records have been wrapped, instead of leaking until garbage
    # collection.
    with VariantFile(self.vcf) as vcf:
        sv_list = [SV(rec) for rec in vcf.fetch()]
    return {sv.name: sv for sv in sv_list}
def _add_match_fields(db_fields, sample_fields):
    """Declare match-result variants of header fields only the database has.

    For every entry of ``db_fields`` whose key is missing from
    ``sample_fields``, three fields are added to ``sample_fields``: the entry
    name suffixed with ``_FOUND``, ``_NOTFOUND`` and ``_NOCALL``, each reusing
    the entry's type and description.  Returns the list of entry names that
    were handled this way.
    """
    variants = (
        ('_FOUND', 'Allele(s) found: '),
        ('_NOTFOUND', 'Allele(s) not found: '),
        ('_NOCALL', 'Allele(s) with uncertain presense: '),
    )
    copied = []
    for key, meta in db_fields.items():
        if key in sample_fields:
            continue
        copied.append(meta.name)
        for suffix, label in variants:
            sample_fields.add(meta.name + suffix,
                              number='.',
                              type=meta.type,
                              description=label + meta.description)
    return copied


def _print_match_debug(chrom, allele, super_non_ref):
    """Dump the database allele and the candidate sample loci to stderr."""
    super_start, super_stop = get_superlocus_bounds([[allele], super_non_ref])
    print('-' * 80, file=sys.stderr)
    print('{}:[{:d}-{:d}):'.format(chrom, super_start, super_stop),
          file=sys.stderr)
    print(file=sys.stderr)
    print(' ALLELE: {} {}:[{}-{}) ref={} alt={}'.format(
        allele.record.id, allele.contig, allele.start, allele.stop,
        allele.alleles[0] or '-', allele.alleles[1] or '-'),
          file=sys.stderr)
    print(file=sys.stderr)

    for number, locus in enumerate(super_non_ref, 1):
        lref = locus.alleles[0] or '-'
        indices = locus.allele_indices
        n_indices = len(indices)
        if indices.count(None) == n_indices:
            geno = 'nocall'
        elif indices.count(0) == n_indices:
            geno = 'refcall'
        else:
            sep = '|' if locus.phased else '/'
            geno = sep.join(locus.alleles[a] or '-' if a is not None else '.'
                            for a in indices)
        print(' VAR{:d}: {}[{:5d}-{:5d}) ref={} geno={}'.format(
            number, locus.contig, locus.start, locus.stop, lref, geno),
              file=sys.stderr)


def match_database(args):
    """Annotate sample records with the outcome of database allele searches.

    Reads ``args.reference`` (FASTA), ``args.database`` and ``args.sample``
    (variant files) and writes the sample records to ``args.output``.  For
    each database allele, the overlapping non-reference sample loci are
    searched with ``find_allele``; the database's INFO/FORMAT metadata is then
    copied onto the overlapping sample loci under a ``_FOUND``, ``_NOTFOUND``
    or ``_NOCALL`` suffix.  ``args.name`` and ``args.debug`` are also used.
    """
    # Load FASTA reference
    refs = Fastafile(expanduser(args.reference))

    # Open input variant files
    db = VariantFile(args.database)
    sample = VariantFile(args.sample)

    # Extend the sample header; remember which database fields get copied.
    format_meta = _add_match_fields(db.header.formats, sample.header.formats)
    info_meta = _add_match_fields(db.header.info, sample.header.info)

    with VariantFile(args.output, 'w', header=sample.header) as out:
        # Create parallel locus iterator by chromosome
        by_chromosome = records_by_chromosome(refs, [sample, db],
                                              [args.name, None], args)
        for chrom, ref, loci in by_chromosome:
            # Create superloci by taking the union of overlapping loci across
            # all of the locus streams
            loci = [
                sort_almost_sorted(stream,
                                   key=NormalizedLocus.extreme_order_key)
                for stream in loci
            ]
            superloci = union(loci,
                              interval_func=attrgetter('min_start',
                                                       'max_stop'))

            # Proceed by superlocus
            for _, _, (superlocus, alleles) in superloci:
                alleles.sort(key=NormalizedLocus.natural_order_key)
                superlocus.sort(key=NormalizedLocus.natural_order_key)

                for allele in alleles:
                    super_allele = [
                        locus for locus in superlocus
                        if locus.extremes_intersect(allele)
                    ]

                    # Remove all reference calls from the superlocus.
                    # This is primarily done to remove long leading and
                    # trailing reference regions.  Interstitial reference
                    # regions will be added back, based on how gaps are
                    # handled.
                    super_non_ref = [
                        locus for locus in super_allele if not locus.is_ref()
                    ]

                    if args.debug:
                        _print_match_debug(chrom, allele, super_non_ref)

                    # Search superlocus for allele
                    match_zygosity = find_allele(ref,
                                                 allele,
                                                 super_non_ref,
                                                 debug=args.debug)

                    if args.debug:
                        print(file=sys.stderr)
                        print(' MATCH={}'.format(match_zygosity),
                              file=sys.stderr)
                        print(file=sys.stderr)

                    # Annotate results of search
                    if match_zygosity is None:
                        suffix = '_NOCALL'
                    elif match_zygosity == 0:
                        suffix = '_NOTFOUND'
                    else:
                        suffix = '_FOUND'

                    # Number of times to repeat the copied metadata
                    times = match_zygosity if suffix == '_FOUND' else 1

                    for locus in super_allele:
                        annotate_info(locus, allele, info_meta, suffix, times)
                        annotate_format(locus, allele, format_meta, suffix,
                                        times)

                ordered = sorted(superlocus,
                                 key=NormalizedLocus.record_order_key)
                for locus in ordered:
                    out.write(locus.record)
def run_process(opts, inputvcf):
    """Flag low-confidence calls of ``inputvcf`` and write the annotated VCF.

    For every ALT allele of every record the following labels may be
    collected: the low-confidence database type (exact
    ``chrom-pos-ref-alt`` match, overridden by any matching range entry),
    ``strand_biased``, ``homopolymer``, ``repeat_sequence``,
    ``lowcoverage_indel`` and ``lowcoverage_snv``.  The labels of one allele
    are joined with ``|``, alleles are joined with ``,`` and stored in
    ``INFO/LOW_CONFIDENCE``; every distinct label is also added to FILTER.
    Records left without a filter get ``PASS``; finally every filter named in
    ``remove_filter_list`` is removed again.

    Args:
        opts: options object providing ``database``, ``output`` (falsy means
            stdout), ``minhomopolyx``, ``minrepeatcount``, ``maxvaf``,
            ``indelmaxdp``, ``indelmaxao``, ``indelmaxvaf`` and ``snvmaxdp``.
        inputvcf: path of the VCF to read.

    ``remove_filter_list`` is not defined in this function; it is looked up
    in the enclosing module scope.
    """
    db_file = opts.database
    outputvcf = opts.output
    minhomopolyx = int(opts.minhomopolyx)
    minrepeatcount = int(opts.minrepeatcount)
    maxvaf = float(opts.maxvaf)
    indelmaxdp = int(opts.indelmaxdp)
    indelmaxao = int(opts.indelmaxao)
    indelmaxvaf = float(opts.indelmaxvaf)
    snvmaxdp = int(opts.snvmaxdp)

    # Get Lowconf Database (obj1 : standard, obj2 : range)
    lowconfobj1, lowconfobj2 = lowconfdb2obj(db_file)

    # FILTER IDs this step may emit, with their header descriptions.
    filter_descriptions = (
        ("homopolymer", "Homopolymer Sequence Region"),
        ("repeat_sequence", "Repeat Sequence Region"),
        ("sequencing_error", "Sequencing Error Low Confidence Region"),
        ("mapping_error", "Mapping Error Low Confidence Region"),
        ("snp_candidate", "SNP Candidates"),
        ("strand_biased", "Strand Biased (Freebayes)"),
        ("lowcoverage_indel", "Low Coverage (DP,AO,VAF) Indels"),
        ("lowcoverage_snv", "Low Coverage (DP) SNVs"),
    )

    # Both files are context-managed so the output is flushed and closed even
    # when a record raises.
    with VariantFile(inputvcf) as vcf_in:
        # Add INFO to Header
        if not ngb_functions.vcfHeaderCheck(vcf_in.header.info,
                                            "LOW_CONFIDENCE"):
            vcf_in.header.info.add("LOW_CONFIDENCE", ".", "String",
                                   "Low Confidence Type")

        # Add FILTER to Header
        for filter_id, description in filter_descriptions:
            if not ngb_functions.vcfHeaderCheck(vcf_in.header.filters,
                                                filter_id):
                vcf_in.header.filters.add(filter_id, None, None, description)

        # Write VCF (the header must be complete before the writer is opened)
        with VariantFile(outputvcf if outputvcf else '-', 'w',
                         header=vcf_in.header) as vcf_out:
            for record in vcf_in.fetch():
                chrom = record.chrom
                pos = record.pos
                ref = record.ref
                alts = record.alts
                # NOTE(review): VAF, AO, TYPE and the ALT length are taken
                # from the first allele only and applied to every ALT below;
                # presumably the input is already split to one ALT per
                # record -- confirm.
                vaf = float(record.samples[0]["NGB_VAF"][0])
                ao = int(record.samples[0]["NGB_AO"][0])
                dp = int(record.samples[0]["NGB_DP"])
                vtype = record.info["TYPE"][0]
                reflen = len(record.ref)
                altlen = len(record.alts[0])

                lowconf_info_list = []
                for i, alt in enumerate(alts):
                    # Get Lowconf info (exact chrom-pos-ref-alt match)
                    id1 = chrom + '-' + str(pos) + '-' + ref + '-' + alt
                    if id1 in lowconfobj1:
                        lowconf = lowconfobj1[id1]
                    else:
                        lowconf = ""

                    # Get Lowconf Info from range database; the last matching
                    # range wins and overrides the exact match.
                    for lowconfdata in lowconfobj2:
                        if (chrom == lowconfdata["chrom"]
                                and int(lowconfdata["start"]) <= pos
                                <= int(lowconfdata["end"])):
                            lowconf = lowconfdata["type"]

                    # Get Strand Biased Information (Freebayes tags)
                    strandbiased = ""
                    if "SAF" in record.info:
                        if (record.info["SAF"][i] == 0
                                or record.info["SAR"][i] == 0
                                or record.info["RPR"][i] < 1
                                or record.info["RPL"][i] < 1):
                            strandbiased = "strand_biased"

                    # Homopolymer & Repeat Sequence Filtering, only applied
                    # to calls below the VAF ceiling.
                    homopolymerinfo = ""
                    repeatinfo = ""
                    if vaf < maxvaf:
                        if "HOMOPOLYX" in record.info:
                            if int(record.info["HOMOPOLYX"][0]) >= minhomopolyx:
                                homopolymerinfo = "homopolymer"
                        if "REPEAT_COUNT" in record.info:
                            if int(record.info["REPEAT_COUNT"]
                                   [0]) >= minrepeatcount:
                                repeatinfo = "repeat_sequence"

                    # Indel Filtering
                    lowcovindelinfo = ""
                    if altlen != reflen and vtype in ("ins", "del", "complex"):
                        if (vaf < indelmaxvaf or ao < indelmaxao
                                or dp < indelmaxdp):
                            lowcovindelinfo = "lowcoverage_indel"

                    # SNV Filtering
                    lowcovsnvinfo = ""
                    if altlen == reflen and vtype in ("snp", "complex"):
                        if dp < snvmaxdp:
                            lowcovsnvinfo = "lowcoverage_snv"

                    # Join this allele's non-empty labels with "|"; alleles
                    # without any label contribute nothing.
                    labels = (lowconf, strandbiased, homopolymerinfo,
                              repeatinfo, lowcovindelinfo, lowcovsnvinfo)
                    joined = "|".join(label for label in labels
                                      if label != '')
                    if joined != '':
                        lowconf_info_list.append(joined)

                if lowconf_info_list:
                    record.info['LOW_CONFIDENCE'] = ','.join(
                        str(e) for e in lowconf_info_list)

                # Add FILTER: one entry per distinct label.  The INFO value is
                # read back from the record, so a LOW_CONFIDENCE value that
                # was already present in the input is honoured as well.
                lowconf_infolist = []
                if 'LOW_CONFIDENCE' in record.info:
                    for lowconf_info in record.info['LOW_CONFIDENCE']:
                        lowconf_infolist += lowconf_info.split("|")
                # dict.fromkeys de-duplicates while keeping first-seen order,
                # so the FILTER column is stable between runs.
                for lowconf_info in dict.fromkeys(lowconf_infolist):
                    record.filter.add(lowconf_info)

                # PASS FILTER
                if list(record.filter) == []:
                    record.filter.add("PASS")

                # Remove Filter
                for rf in remove_filter_list:
                    if rf in list(record.filter):
                        del record.filter[rf]

                # Write VCF
                vcf_out.write(record)
def fetch(self, chrm, pos_start, pos_end, return_samples=False):
    """Fetch records of one chromosome's population VCF within a region.

    The file opened is ``<self.pop_vcf_stem>.<chrm>.vcf.gz``.  Sample columns
    are dropped unless ``return_samples`` is true.  The result of
    ``VariantFile.fetch(chrm, pos_start, pos_end)`` is returned as is.
    """
    path = "{}.{}.vcf.gz".format(self.pop_vcf_stem, chrm)
    skip_samples = not return_samples
    reader = VariantFile(path, drop_samples=skip_samples)
    return reader.fetch(chrm, pos_start, pos_end)
#!/group/ctan/anaconda3/envs/snakemake/bin/python
"""Write a tab-separated table of selected chr3H variants.

Usage: script <input.vcf/bcf> <output.tsv>

For every chr3H record accepted by ``samvcf`` (``record.flt`` is true and
``record.diff_repeat(smps)`` is true) one row is written: ``record.opt``
followed, for each sample in ``smps``, by the sum of its GT indices and its
comma-joined AD values.
"""
import sys
from vcf_ctan import samvcf
from pysam import VariantFile

samples = ["AC", "BD", "Commander", "EC2.1", "EC2.2", "EC7.1", "EC7.2",
           "Fleet", "Hindmarsh", "La_Trobe", "Scope", "Vlamingh", "W1",
           "WI4304", "X1", "barke", "bowman", "haruna_Nijo", "igri",
           "spontaneum_B1k-04-12"]
# The four samples compared: EC2.1, EC2.2, EC7.1, EC7.2
smps = [samples[3], samples[4], samples[5], samples[6]]

# The original list literal was missing its closing bracket (syntax error).
hd = "\t".join(["#chr", "pos", "len", "ref", "ref_num", "alt", "alt_num"])

# Context managers close both files even if a record raises.
with VariantFile(sys.argv[1]) as ibcf, open(sys.argv[2], "w") as ofile:
    # Terminate the header line; it was previously fused with the first row.
    ofile.write(hd + "\n")
    for one in ibcf.fetch("chr3H"):
        record = samvcf(one)
        if record.flt and record.diff_repeat(smps):
            opt = list(record.opt)
            for smp in smps:
                call = one.samples[smp]
                opt.append(str(sum(call['GT'])))
                opt.append(",".join(map(str, call['AD'])))
            ofile.write("\t".join(opt) + "\n")
#!/usr/bin/env python3
"""Copy a VCF to stdout, dropping records that repeat the previous position.

Usage: script <input.vcf>

A record is written only when its (chrom, pos) differs from that of the
record read just before it, so only consecutive duplicates are removed.
"""
from pysam import VariantFile
import sys

# Context managers make sure the stdout writer is flushed and closed instead
# of relying on interpreter shutdown.
with VariantFile(sys.argv[1], 'r') as vcf_in, \
        VariantFile('-', 'w', header=vcf_in.header) as vcf_out:
    cp = (0, 0)  # position of the previous record; never equals a real one
    for rec in vcf_in.fetch():
        here = (rec.chrom, rec.pos)
        if here != cp:
            vcf_out.write(rec)
            cp = here
async def import_data(file_id, filepath, core=None, reference_id=2):
    """Import the variants of a VCF file into the database.

    The file header is parsed to find samples and VEP / SnpEff annotations,
    the annotation tables are created or extended as needed, and then every
    record is turned into INSERT statements for the variant, sample_variant
    and annotation tables.  Statements are sent with ``Model.execute_aio`` in
    batches.  Only paths ending in ``.vcf`` or ``.vcf.gz`` are imported.

    Args:
        file_id: id of the file; used in notifications and ``sample_file``.
        filepath: path of the VCF.  NOTE: the file is rewritten in place by
            ``debug_clear_header`` before parsing.
        core: optional object whose ``notify_all`` receives progress messages.
        reference_id: id of the row in the ``reference`` table.

    NOTE(review): all SQL is built by string formatting from VCF content;
    only annotation values go through ``escape_value_for_sql``.
    """
    import os
    import datetime
    import subprocess
    import reprlib
    import gzip

    from pysam import VariantFile

    from core.framework.common import log, war, err, RegovarException
    import core.model as Model

    # ------------------------------------------------------------------
    # Tools
    # ------------------------------------------------------------------

    def count_vcf_row(filename):
        """
            Use linux OS commands to quickly count the non-header lines
            (variants to parse) of the vcf file
        """
        bashCommand = 'grep -v "^#" ' + str(filename) + ' | wc -l'
        if filename.endswith("gz"):
            bashCommand = "z" + bashCommand
        process = subprocess.Popen(bashCommand,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        cmd_out = process.communicate()[0]
        return int(cmd_out.decode('utf8'))

    def debug_clear_header(filename):
        """
            A workaround to fix a bug with GVCF header with pysam.
            EDIT : in fact the problem seems to be that pysam does not support
            some kinds of compression, so this command is still used to rezip
            the vcf in a supported format.

            The file is replaced in place by a gzip-compressed copy without
            the ##GVCFBlock header lines.
        """
        bashCommand = "grep -v '^##GVCFBlock' {} | gzip --best > /var/regovar/downloads/tmp_workaround".format(
            filename)
        if filename.endswith("gz"):
            bashCommand = "z" + bashCommand
        process = subprocess.Popen(bashCommand,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        # Wait until the rewritten copy is complete before moving it; without
        # this the move (and the caller's read) raced the compression.
        process.communicate()
        bashCommand = "mv /var/regovar/downloads/tmp_workaround {} ".format(
            filename)
        process = subprocess.Popen(bashCommand,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        # Wait for the move too, so the caller reads the final file.
        process.communicate()

    def prepare_vcf_parsing(filename):
        """
            Parse vcf headers and return information about which data shall be
            parsed and stored in the database
        """
        # Extract headers
        debug_clear_header(filename)
        headers = {}
        samples = []
        _op = open
        if filename.endswith('gz') or filename.endswith('zip'):
            _op = gzip.open
        with _op(filename) as f:
            for line in f:
                if _op != open:
                    # gzip.open yields bytes
                    line = line.decode()
                if line.startswith('##'):
                    # "##key=value" meta line
                    l = line[2:].strip()
                    l = [l[0:l.index('=')], l[l.index('=') + 1:]]
                    if l[0] not in headers.keys():
                        if l[0] == 'INFO':
                            headers[l[0]] = {}
                        else:
                            headers[l[0]] = []
                    if l[0] == 'INFO':
                        # Positional parse of <ID=..,Number=..,Type=..,
                        # Description="..">; the slices strip "ID=", "Type="
                        # and 'Description="'.
                        data = l[1][1:-1].split(',')
                        info_id = data[0][3:]
                        info_type = data[2][5:]
                        info_desc = data[3][13:-1]
                        headers['INFO'].update({
                            info_id: {
                                'type': info_type,
                                'description': info_desc
                            }
                        })
                    else:
                        headers[l[0]].append(l[1])
                elif line.startswith('#'):
                    # "#CHROM ..." column line: samples start at column 10
                    samples = line[1:].strip().split('\t')[9:]
                else:
                    break

        # Check for VEP
        vep = {'vep': False}
        if 'VEP' in headers.keys() and 'CSQ' in headers['INFO'].keys():
            d = headers['INFO']['CSQ']['description'].split('Format:')
            vep = {
                'vep': {
                    'version': headers['VEP'][0].split(' ')[0],
                    'flag': 'CSQ',
                    'name': 'VEP',
                    'db_type': 'transcript',
                    'db_pk_field': 'Feature',
                    'description': d[0].strip(),
                    'columns': d[1].strip().split('|'),
                }
            }
            if 'Feature' not in vep['vep']['columns']:
                vep = {'vep': False}

        # Check for SnpEff
        snpeff = {'snpeff': False}
        if 'SnpEffVersion' in headers.keys():
            if 'ANN' in headers['INFO'].keys():
                # TODO
                pass
            elif 'EFF' in headers['INFO'].keys():
                d = headers['INFO']['EFF']['description'].split('\'')
                snpeff = {
                    'snpeff': {
                        'version':
                        headers['SnpEffVersion'][0].strip().strip('"').split(
                            ' ')[0],
                        'flag': 'EFF',
                        'name': 'SnpEff',
                        'db_type': 'transcript',
                        'db_pk_field': 'Transcript_ID',
                        'columns':
                        [c.strip() for c in d[1].strip().split('|')],
                        'description': d[0].strip(),
                    }
                }
                if 'Transcript_ID' not in snpeff['snpeff']['columns']:
                    snpeff = {'snpeff': False}

        # Retrieve extension
        # NOTE(review): for "x.vcf.gz" this yields "gzvcf." (last extension,
        # then the previous one plus a dot) -- confirm this is the intended
        # value before relying on it.
        file_type = os.path.split(filename)[1].split('.')[-1]
        if not 'vcf' in file_type:
            file_type += os.path.split(filename)[1].split('.')[-2] + "."

        # Return result
        result = {
            'vcf_version': headers['fileformat'][0],
            'name': os.path.split(filename)[1],
            'count': count_vcf_row(filename),
            'size': os.path.getsize(filename),
            'type': file_type,
            'samples': samples,
            'annotations': {}
        }
        result['annotations'].update(vep)
        result['annotations'].update(snpeff)
        return result

    def normalise_annotation_name(name):
        """
            Tool to convert a name of a annotation tool/db/field/version into
            the corresponding valid name for the database
        """
        if name[0].isdigit():
            name = '_' + name

        def check_char(char):
            if char in ['.', '-', '_', '/']:
                return '_'
            elif char.isalnum():
                # TODO : remove accents
                return char.lower()
            else:
                return ''

        return ''.join(check_char(c) for c in name)

    def create_annotation_db(reference_id, reference_name, table_name,
                             vcf_annotation_metadata):
        """
            Create an annotation database according to information retrieved
            from the VCF file with the prepare_vcf_parsing method.
            Returns (database uid, column mapping).
        """
        # Create annotation table
        pk = 'transcript_id character varying(100), ' if vcf_annotation_metadata[
            'db_type'] == 'transcript' else ''
        pk2 = ',transcript_id' if vcf_annotation_metadata[
            'db_type'] == 'transcript' else ''
        pattern = "CREATE TABLE {0} (variant_id bigint, bin integer, chr integer, pos bigint, ref text, alt text, " + pk + "{1}, CONSTRAINT {0}_ukey UNIQUE (variant_id" + pk2 + "));"
        query = ""
        db_map = {}
        fields = []
        for col in vcf_annotation_metadata['columns']:
            col_name = normalise_annotation_name(col)
            fields.append("{} text".format(col_name))
            db_map[col_name] = {
                'name': col_name,
                'type': 'string',
                'name_ui': col
            }
        # By default, create a table with only text field. Type can be changed
        # by user via a dedicated UI
        query += pattern.format(table_name, ', '.join(fields))
        query += "CREATE INDEX {0}_idx_vid ON {0} USING btree (variant_id);".format(
            table_name)
        query += "CREATE INDEX {0}_idx_var ON {0} USING btree (bin, chr, pos);".format(
            table_name)
        if vcf_annotation_metadata['db_type'] == 'transcript':
            query += "CREATE INDEX {0}_idx_tid ON {0} USING btree (transcript_id);".format(
                table_name)

        # Register annotation
        db_uid, pk_uid = Model.execute(
            "SELECT MD5('{0}'), MD5(concat(MD5('{0}'), '{1}'))".format(
                table_name,
                normalise_annotation_name(
                    vcf_annotation_metadata['db_pk_field']))).first()
        query += "INSERT INTO annotation_database (uid, reference_id, name, version, name_ui, description, ord, type, db_pk_field_uid, jointure) VALUES "
        query += "('{0}', {1}, '{2}', '{3}', '{4}', '{5}', {6}, '{7}', '{8}', '{2} ON {2}.bin={{0}}.bin AND {2}.chr={{0}}.chr AND {2}.pos={{0}}.pos AND {2}.ref={{0}}.ref AND {2}.alt={{0}}.alt AND {2}.transcript_id={{0}}.transcript_pk_value');".format(
            # We removed this condition /*AND {{0}}.transcript_pk_field_uid=\"{8}\"*/
            # in the jointure as this condition is already done by a previous
            # query when updating working table with annotations
            db_uid,
            reference_id,
            table_name,
            vcf_annotation_metadata['version'],
            vcf_annotation_metadata['name'],
            vcf_annotation_metadata['description'],
            30,
            vcf_annotation_metadata['db_type'],
            pk_uid)
        query += "INSERT INTO annotation_field (database_uid, ord, name, name_ui, type) VALUES "
        for idx, f in enumerate(vcf_annotation_metadata['columns']):
            query += "('{0}', {1}, '{2}', '{3}', 'string'),".format(
                db_uid, idx, normalise_annotation_name(f), f)
        # [:-1] drops the trailing comma of the VALUES list
        Model.execute(query[:-1])
        Model.execute(
            "UPDATE annotation_field SET uid=MD5(concat(database_uid, name)) WHERE uid IS NULL;"
        )
        return db_uid, db_map

    def prepare_annotation_db(reference_id, vcf_annotation_metadata):
        """
            Prepare database for import of custom annotation, and set the
            mapping between VCF info fields and DB schema.
            The metadata dict is updated in place and returned.
        """
        reference = Model.execute(
            "SELECT table_suffix FROM reference WHERE id={}".format(
                reference_id)).first()[0]
        table_name = normalise_annotation_name('{}_{}_{}'.format(
            vcf_annotation_metadata['flag'],
            vcf_annotation_metadata['version'], reference))

        # Get database schema (if available)
        table_cols = {}
        db_uid = Model.execute(
            "SELECT uid FROM annotation_database WHERE name='{}'".format(
                table_name)).first()
        if db_uid is None:
            # No table in db for these annotation : create new table
            db_uid, table_cols = create_annotation_db(reference_id, reference,
                                                      table_name,
                                                      vcf_annotation_metadata)
        else:
            db_uid = db_uid[0]
            # Table already exists : retrieve columns already defined
            for col in Model.execute(
                    "SELECT name, name_ui, type FROM annotation_field WHERE database_uid='{}'"
                    .format(db_uid)):
                table_cols[col.name] = {
                    'name': col.name,
                    'type': col.type,
                    'name_ui': col.name_ui
                }

        # Get diff between columns in vcf and columns in DB, and update DB
        # schema
        diff = []
        for col in vcf_annotation_metadata['columns']:
            if normalise_annotation_name(col) not in table_cols.keys():
                diff.append(col)
        if len(diff) > 0:
            offset = len(vcf_annotation_metadata['columns'])
            query = ""
            for idx, col in enumerate(diff):
                name = normalise_annotation_name(col)
                query += "ALTER TABLE {0} ADD COLUMN {1} text; INSERT INTO public.annotation_field (database_uid, ord, name, name_ui, type) VALUES ('{2}', {3}, '{1}', '{4}', 'string');".format(
                    table_name, name, db_uid, offset + idx, col)
                table_cols[name] = {
                    'name': name,
                    'type': 'string',
                    'name_ui': col
                }
            # execute query
            Model.execute(query)

        # Update vcf_annotation_metadata with database mapping
        db_pk_field_uid = Model.execute(
            "SELECT db_pk_field_uid FROM annotation_database WHERE uid='{}'".
            format(db_uid)).first().db_pk_field_uid
        vcf_annotation_metadata.update({
            'table': table_name,
            'db_uid': db_uid,
            'db_pk_field_uid': db_pk_field_uid
        })
        vcf_annotation_metadata['db_map'] = {}
        for col in vcf_annotation_metadata['columns']:
            vcf_annotation_metadata['db_map'][col] = table_cols[
                normalise_annotation_name(col)]
        return vcf_annotation_metadata

    def normalize_chr(chrm):
        """
            Normalize chromosome number from VCF format into Database format:
            X, Y and M map to 23, 24 and 25; anything that is not an integer
            after stripping the prefix maps to None.
        """
        chrm = chrm.upper()
        if chrm.startswith("CHROM"):
            chrm = chrm[5:]
        if chrm.startswith("CHRM") and chrm != "CHRM":
            chrm = chrm[4:]
        if chrm.startswith("CHR"):
            chrm = chrm[3:]
        if chrm == "X":
            chrm = 23
        elif chrm == "Y":
            chrm = 24
        elif chrm == "M":
            chrm = 25
        else:
            try:
                chrm = int(chrm)
            except Exception as error:
                # TODO log /report error
                chrm = None
        return chrm

    def normalize(pos, ref, alt):
        """
            Normalize given (position, ref and alt) from VCF into Database
            format
             - Assuming that positions in VCF are 1-based (0-based in Database)
             - trimming ref and alt to get minimal alt (and update position
               accordingly)
            Returns (None, None, None) when ref and alt are equal.
        """
        # input pos coming from VCF are 1-based.
        # to be consistent with UCSC databases we convert it into 0-based
        pos -= 1
        if (ref == alt):
            return None, None, None
        if ref is None:
            ref = ''
        if alt is None:
            alt = ''
        # Trim the common prefix, shifting the position accordingly
        while len(ref) > 0 and len(alt) > 0 and ref[0] == alt[0]:
            ref = ref[1:]
            alt = alt[1:]
            pos += 1
        # Trim the common suffix (same-length alleles only)
        if len(ref) == len(alt):
            while ref[-1:] == alt[-1:]:
                ref = ref[0:-1]
                alt = alt[0:-1]
        return pos, ref, alt

    def normalize_gt(infos):
        """
            Normalize GT sample information from VCF format into Database
            format (0: homozygous ref or no-call, '1': homozygous alt,
            '2': heterozygous with ref, '3': heterozygous without ref)
        """
        gt = get_info(infos, 'GT')
        if gt != 'NULL':
            if infos['GT'][0] == infos['GT'][1]:
                # Homozygous ref
                if infos['GT'][0] in [None, 0]:
                    return 0
                # Homozygous alt
                return '1'
            else:
                if 0 in infos['GT']:
                    # Hetero ref
                    return '2'
                else:
                    return '3'
        log("unknow : " + str(infos['GT']))
        return -1

    def get_alt(alt):
        """
            Retrieve alternative values from VCF data
        """
        if ('|' in alt):
            return alt.split('|')
        else:
            return alt.split('/')

    def get_info(infos, key):
        """
            Retrieving info annotation from VCF data; missing or None values
            are returned as the string 'NULL'
        """
        if (key in infos):
            if infos[key] is None:
                return 'NULL'
            return infos[key]
        return 'NULL'

    def is_transition(ref, alt):
        """
            Return true if the variant is a transition (A<->G or C<->T);
            false otherwise
        """
        tr = ref + alt
        if len(ref) == 1 and tr in ('AG', 'GA', 'CT', 'TC'):
            return True
        return False

    def escape_value_for_sql(value):
        if type(value) is str:
            value = value.replace('%', '%%')
            value = value.replace("'", "''")
        return value

    # ------------------------------------------------------------------
    # Tiers code from vtools.  Bin index calculation
    # ------------------------------------------------------------------
    # Utility function to calculate bins.
    #
    # This function implements a hashing scheme that UCSC uses (developed by
    # Jim Kent) to take in a genomic coordinate range and return a set of
    # genomic "bins" that your range intersects.  I found a Java
    # implementation on-line (I need to find the URL) and I simply manually
    # converted the Java code into Python code.
    # IMPORTANT: Because this is UCSC code the start coordinates are 0-based
    # and the end coordinates are 1-based!!!!!!
    #
    BINRANGE_MAXEND_512M = 512 * 1024 * 1024
    # binOffsetOldToExtended = 4681;  # (4096 + 512 + 64 + 8 + 1 + 0)
    _BINOFFSETS = (
        512 + 64 + 8 + 1,  # = 585, min val for level 0 bins (128kb binsize)
        64 + 8 + 1,  # = 73, min val for level 1 bins (1Mb binsize)
        8 + 1,  # = 9, min val for level 2 bins (8Mb binsize)
        1,  # = 1, min val for level 3 bins (64Mb binsize)
        0)  # = 0, only val for level 4 bin (512Mb binsize)
    # 1: 0000 0000 0000 0001 1<<0
    # 8: 0000 0000 0000 1000 1<<3
    # 64: 0000 0000 0100 0000 1<<6
    # 512: 0000 0010 0000 0000 1<<9
    _BINFIRSTSHIFT = 17  # How much to shift to get to finest bin.
    _BINNEXTSHIFT = 3  # How much to shift to get to next larger bin.
    _BINLEVELS = len(_BINOFFSETS)

    #
    # IMPORTANT: the start coordinate is 0-based and the end coordinate is
    # 1-based.
    #
    def getUcscBins(start, end):
        bins = []
        startBin = start >> _BINFIRSTSHIFT
        endBin = (end - 1) >> _BINFIRSTSHIFT
        for i in range(_BINLEVELS):
            offset = _BINOFFSETS[i]
            if startBin == endBin:
                bins.append(startBin + offset)
            else:
                for bin in range(startBin + offset, endBin + offset):
                    bins.append(bin)
            startBin >>= _BINNEXTSHIFT
            endBin >>= _BINNEXTSHIFT
        return bins

    def getMaxUcscBin(start, end):
        bin = 0
        startBin = start >> _BINFIRSTSHIFT
        endBin = (end - 1) >> _BINFIRSTSHIFT
        for i in range(_BINLEVELS):
            offset = _BINOFFSETS[i]
            if startBin == endBin:
                if startBin + offset > bin:
                    bin = startBin + offset
            else:
                for i in range(startBin + offset, endBin + offset):
                    if i > bin:
                        bin = i
            startBin >>= _BINNEXTSHIFT
            endBin >>= _BINNEXTSHIFT
        return bin

    # ------------------------------------------------------------------
    # Import
    # ------------------------------------------------------------------

    def transaction_end(job_id, result):
        """
            Completion callback for background jobs (currently only referenced
            by the disabled execute_bw code path below)
        """
        job_in_progress.remove(job_id)
        failed = (result is None or result is Exception
                  or isinstance(result, Exception))
        if failed and core is not None:
            core.notify_all({
                'msg': 'import_vcf_end',
                'data': {
                    'file_id': file_id,
                    # Report the job result; the previous code stringified the
                    # imported `err` logging function instead.
                    'msg': 'Error occured : ' + str(result)
                }
            })

    start_0 = datetime.datetime.now()
    job_in_progress = []

    vcf_metadata = prepare_vcf_parsing(filepath)
    db_ref_suffix = "_" + Model.execute(
        "SELECT table_suffix FROM reference WHERE id={}".format(
            reference_id)).first().table_suffix

    # Prepare database for import of custom annotation, and set the mapping
    # between VCF info fields and DB schema
    for annotation in vcf_metadata['annotations'].keys():
        if vcf_metadata['annotations'][annotation]:
            data = prepare_annotation_db(
                reference_id, vcf_metadata['annotations'][annotation])
            vcf_metadata['annotations'][annotation].update(data)

    if filepath.endswith(".vcf") or filepath.endswith(".vcf.gz"):
        start = datetime.datetime.now()

        # Create vcf parser
        vcf_reader = VariantFile(filepath)

        # get samples in the VCF
        samples = {
            i: Model.get_or_create(Model.session(), Model.Sample, name=i)[0]
            for i in list((vcf_reader.header.samples))
        }

        if len(samples.keys()) == 0:
            war("VCF files without sample cannot be imported in the database.")
            if core is not None:
                core.notify_all({
                    'msg': 'import_vcf_end',
                    'data': {
                        'file_id':
                        file_id,
                        'msg':
                        "VCF files without sample cannot be imported in the database."
                    }
                })
            return

        if core is not None:
            core.notify_all({
                'msg': 'import_vcf_start',
                'data': {
                    'file_id':
                    file_id,
                    'samples': [{
                        'id': samples[s].id,
                        'name': samples[s].name
                    } for s in samples.keys()]
                }
            })

        # Associate sample to the file
        Model.execute(
            "INSERT INTO sample_file (sample_id, file_id) VALUES {0} ON CONFLICT DO NOTHING;"
            .format(','.join([
                "({0}, {1})".format(samples[sid].id, file_id)
                for sid in samples
            ])))

        # parsing vcf file
        records_count = vcf_metadata['count']
        records_current = 0
        table = "variant" + db_ref_suffix

        log("Importing file {0}\n\r\trecords : {1}\n\r\tsamples : ({2}) {3}\n\r\tstart : {4}"
            .format(filepath, records_count, len(samples.keys()),
                    reprlib.repr([s for s in samples.keys()]), start))

        sql_pattern1 = "INSERT INTO {0} (chr, pos, ref, alt, is_transition, bin, sample_list) VALUES ({1}, {2}, '{3}', '{4}', {5}, {6}, array[{7}]) ON CONFLICT (chr, pos, ref, alt) DO UPDATE SET sample_list=array_intersect({0}.sample_list, array[{7}]) WHERE {0}.chr={1} AND {0}.pos={2} AND {0}.ref='{3}' AND {0}.alt='{4}';"
        sql_pattern2 = "INSERT INTO sample_variant" + db_ref_suffix + " (sample_id, variant_id, bin, chr, pos, ref, alt, genotype, depth) SELECT {0}, id, {1}, '{2}', {3}, '{4}', '{5}', '{6}', {7} FROM variant" + db_ref_suffix + " WHERE bin={1} AND chr={2} AND pos={3} AND ref='{4}' AND alt='{5}' ON CONFLICT (sample_id, variant_id) DO NOTHING;"
        sql_pattern3 = "INSERT INTO {0} (variant_id, bin,chr,pos,ref,alt, transcript_id, {1}) SELECT id, {3},{4},{5},'{6}','{7}', '{8}', {2} FROM variant" + db_ref_suffix + " WHERE bin={3} AND chr={4} AND pos={5} AND ref='{6}' AND alt='{7}' ON CONFLICT (variant_id, transcript_id) DO NOTHING;"
        # TODO : on conflict, shall update fields with value in the VCF to
        # complete database annotation with (maybe) new fields

        sql_query1 = ""
        sql_query2 = ""
        sql_query3 = ""
        count = 0

        for r in vcf_reader:
            records_current += 1
            if core is not None:
                core.notify_all({
                    'msg': 'import_vcf',
                    'data': {
                        'file_id':
                        file_id,
                        'progress_total':
                        records_count,
                        'progress_current':
                        records_current,
                        'progress_percent':
                        round(records_current / max(1, records_count) * 100, 2)
                    }
                })

            chrm = normalize_chr(str(r.chrom))

            samples_array = ','.join([str(samples[s].id) for s in r.samples])
            for sn in r.samples:
                s = r.samples.get(sn)
                if (len(s.alleles) > 0):
                    # Exactly the first two alleles of the call are imported.
                    # NOTE(review): a haploid call raises IndexError here (as
                    # before), and a missing allele (None) is normalised to an
                    # empty ALT -- confirm both are acceptable.
                    call_alleles = (s.alleles[0], s.alleles[1])

                    for call_allele in call_alleles:
                        pos, ref, alt = normalize(r.pos, r.ref, call_allele)
                        if pos is not None and alt != ref:
                            variant_bin = getMaxUcscBin(pos, pos + len(ref))
                            sql_query1 += sql_pattern1.format(
                                table, chrm, pos, ref, alt,
                                is_transition(ref, alt), variant_bin,
                                samples_array)
                            sql_query2 += sql_pattern2.format(
                                samples[sn].id, variant_bin, chrm, pos, ref,
                                alt, normalize_gt(s), get_info(s, 'DP'))
                            count += 1

                    # Import custom annotation for the variant
                    for ann_name, metadata in vcf_metadata[
                            'annotations'].items():
                        if metadata:
                            # By transcript (r.info is a list of annotation.
                            # Inside we shall find, transcript and allele
                            # information to be able to save data for the
                            # current variant)
                            for info in r.info[metadata['flag']]:
                                data = info.split('|')
                                q_fields = []
                                q_values = []
                                allele = ""
                                trx_pk = "NULL"
                                for col_pos, col_name in enumerate(
                                        metadata['columns']):
                                    q_fields.append(
                                        metadata['db_map'][col_name]['name'])
                                    val = escape_value_for_sql(data[col_pos])

                                    if col_name == 'Allele':
                                        allele = val.strip().strip("-")
                                    if col_name == metadata['db_pk_field']:
                                        trx_pk = val.strip()

                                    q_values.append(
                                        '\'{}\''.format(val) if val != ''
                                        and val is not None else 'NULL')

                                for call_allele in call_alleles:
                                    pos, ref, alt = normalize(
                                        r.pos, r.ref, call_allele)
                                    if pos is not None and alt == allele:
                                        # Use the bin of this very allele; the
                                        # previous code reused whichever bin
                                        # was computed last, which could
                                        # belong to the other allele.
                                        variant_bin = getMaxUcscBin(
                                            pos, pos + len(ref))
                                        sql_query3 += sql_pattern3.format(
                                            metadata['table'],
                                            ','.join(q_fields),
                                            ','.join(q_values), variant_bin,
                                            chrm, pos, ref, alt, trx_pk)
                                        count += 1

            # manage split big request to avoid sql out of memory transaction
            if count >= 10000:
                count = 0
                transaction = sql_query1 + sql_query2 + sql_query3
                log("VCF import : Execute async query (as coroutine)")
                await Model.execute_aio(transaction)

                # Reset query buffers
                sql_query1 = ""
                sql_query2 = ""
                sql_query3 = ""

        # Loop done, execute last pending query
        log("VCF import : Execute last async query (as coroutine)")
        transaction = sql_query1 + sql_query2 + sql_query3
        await Model.execute_aio(transaction)
        log("VCF import : Done")

        end = datetime.datetime.now()

        if core is not None:
            core.notify_all({
                'msg': 'import_vcf_end',
                'data': {
                    'file_id':
                    file_id,
                    'msg':
                    'Import done without error.',
                    'samples': [{
                        'id': samples[s].id,
                        'name': samples[s].name
                    } for s in samples.keys()]
                }
            })
def _mergeAndAddGT(snvvcf, indvcf, outfile):
    """Merge an SNV VCF and an indel VCF into one position-ordered VCF with GT.

    Both inputs get a ``TYPE`` INFO field and a ``GT`` FORMAT field. The SNV
    header additionally receives the indel-only INFO/FORMAT definitions because
    it is the header written to the merged file. Genotypes are derived from
    each record's ``SGT`` INFO value. The last two column names of the header
    line are replaced with ``nprefix`` and ``tprefix``; these names are not
    defined in this function and must be provided by the enclosing scope.

    Args:
        snvvcf: Path of the SNV VCF.
        indvcf: Path of the indel VCF.
        outfile: Path of the merged (plain text) VCF to write.
    """
    from pysam import VariantFile

    # INFO definitions added to the SNV header, in output order.
    snv_info = [
        ('TYPE', 1, 'String', 'Type of somatic mutation'),
        ('QSI', 1, 'Integer', 'Quality score for any somatic variant, ie. for the ALT haplotype to be present at a significantly different frequency in the tumor and normal'),
        ('TQSI', 1, 'Integer', 'Data tier used to compute QSI'),
        ('QSI_NT', 1, 'Integer', 'Quality score reflecting the joint probability of a somatic variant and NT'),
        ('TQSI_NT', 1, 'Integer', 'Data tier used to compute QSI_NT'),
        ('IC', 1, 'Integer', 'Number of times RU repeats in the indel allele'),
        ('IHP', 1, 'Integer', 'Largest reference interrupted homopolymer length intersecting with the indel'),
        ('OVERLAP', 0, 'Flag', 'Somatic indel possibly overlaps a second indel.'),
        ('RC', 1, 'Integer', 'Number of times RU repeats in the reference allele'),
        ('RU', 1, 'String', 'Smallest repeating sequence unit in inserted or deleted sequence'),
    ]
    # FORMAT definitions added to the SNV header, in output order.
    snv_formats = [
        ('GT', 1, 'String', 'Possible genotype'),
        ('BCN50', 1, 'Float', 'Fraction of filtered reads within 50 bases of the indel.'),
        ('DP2', 1, 'Integer', 'Read depth for tier2'),
        ('DP50', 1, 'Float', 'Average tier1 read depth within 50 bases'),
        ('FDP50', 1, 'Float', 'Average tier1 number of basecalls filtered from original read depth within 50 bases'),
        ('SUBDP50', 1, 'Float', 'Average number of reads below tier1 mapping quality threshold aligned across sites within 50 bases'),
        ('TAR', 2, 'Integer', 'Reads strongly supporting alternate allele for tiers 1,2'),
        ('TIR', 2, 'Integer', 'Reads strongly supporting indel allele for tiers 1,2'),
        ('TOR', 2, 'Integer', 'Other reads (weak support or insufficient indel breakpoint overlap) for tiers 1,2'),
    ]
    # Indel SGT tokens mapped to genotype allele indices.
    indel_gts = {'ref': (0, 0), 'het': (0, 1), 'hom': (1, 1)}

    # Context managers guarantee the readers and the output are closed even
    # when a record fails half-way through the merge.
    with VariantFile(snvvcf) as snv, VariantFile(indvcf) as ind:
        for definition in snv_info:
            snv.header.info.add(*definition)
        for definition in snv_formats:
            snv.header.formats.add(*definition)
        ind.header.info.add('TYPE', 1, 'String', 'Type of somatic mutation')
        ind.header.formats.add('GT', 1, 'String', 'Possible genotype')

        contigs = list(snv.header.contigs.keys())

        # Can't change sample names with VariantFile, so the header and the
        # records are written as text instead.
        with open(outfile, 'w') as out:
            headers = str(snv.header).splitlines()
            cnames = headers[-1].split("\t")
            cnames[-2] = nprefix
            cnames[-1] = tprefix
            headers[-1] = "\t".join(cnames)
            out.write("\n".join(headers) + "\n")

            r1 = r2 = None  # pending SNV / indel record, None when consumed
            while True:
                if r1 is None:
                    try:
                        r1 = next(snv)
                        r1.info['TYPE'] = 'SNV'
                        alleles = (r1.ref, ) + r1.alts
                        gts = r1.info['SGT'].split('->')
                        try:
                            r1.samples['NORMAL']['GT'] = tuple(
                                sorted(alleles.index(gt) for gt in gts[0]))
                            r1.samples['TUMOR']['GT'] = tuple(
                                sorted(alleles.index(gt) for gt in gts[1]))
                        except ValueError:
                            # SGT names a base that is neither REF nor ALT:
                            # drop this record and fetch the next one.
                            r1 = None
                            continue
                    except StopIteration:
                        r1 = None
                if r2 is None:
                    try:
                        r2 = next(ind)
                        r2.info['TYPE'] = 'INDEL'
                        gts = r2.info['SGT'].split('->')
                        r2.samples['NORMAL']['GT'] = indel_gts[gts[0]]
                        r2.samples['TUMOR']['GT'] = indel_gts[gts[1]]
                    except StopIteration:
                        r2 = None

                if r1 is not None and r2 is not None:
                    # Emit whichever record comes first in (contig order, pos).
                    snv_key = (contigs.index(r1.chrom), r1.pos)
                    ind_key = (contigs.index(r2.chrom), r2.pos)
                    if snv_key < ind_key:
                        out.write(str(r1))
                        r1 = None
                    else:
                        out.write(str(r2))
                        r2 = None
                elif r1 is not None:
                    out.write(str(r1))
                    r1 = None
                elif r2 is not None:
                    out.write(str(r2))
                    r2 = None
                else:
                    break
methylated = mc8[6] # Number Gs unmethylated = mc8[4] # Number As return (methylated, unmethylated) if __name__ == '__main__': parser = argparse.ArgumentParser( description="Takes a list of input files? Or Idrectory...TBD") parser.add_argument("--input_file", default="./101.bcf") parser.add_argument("--output_dir", default="./extract_output/") parser.add_argument("--merge_strands", action="store_true") args = parser.parse_args() infile = VariantFile("101.bcf", threads=4) csv_out_name = args.input_file.replace('.bcf', '.csv') ofile = open(csv_out_name, "w") # Column names for ouptut writer = csv.writer(ofile) writer.writerow([ "chr", "pos", "reference", "call", "methylated", "unmethylated", "strand" ]) # The things in rec.format # GT FT DP MQ GQ QD GL MC8 AMQ CS CG CX # 480 minutes per one bcf--unacceptable!!! # 7 minutes for chrom 22--using 4 threads # 7 minutes for chrom 22--using 8 threads
#!/bin/python3.6
"""Add INFO/DP and INFO/AF to every record of a VCF, derived from FORMAT/AD.

Usage: script.py <input.vcf[.gz]> <output.vcf>

DP is the sum of the first sample's AD values; AF is the first ALT's AD
divided by DP.
"""
import sys

from pysam import VariantFile

# Doesn't matter if bgzipped or not; pysam recognizes the format automatically.
vcf_in = VariantFile(sys.argv[1])

new_header = vcf_in.header
new_header.info.add("DP", "1", "Integer", "Sum of AD fields")
new_header.info.add("AF", "1", "Float", "Alt AD / sum(AD)")

# Start the new VCF with the extended header.
vcf_out = VariantFile(sys.argv[2], 'w', header=new_header)
try:
    for record in vcf_in.fetch():
        allele_depths = record.samples[0].get("AD")
        dp = sum(allele_depths)
        record.info["DP"] = dp
        # With zero total depth the fraction is undefined: leave AF unset
        # instead of aborting the whole run with ZeroDivisionError.
        if dp:
            record.info["AF"] = allele_depths[1] / dp
        vcf_out.write(record)
finally:
    # Close explicitly so the output is flushed even if a record fails.
    vcf_out.close()
    vcf_in.close()
def run_process(opts, inputvcf):
    """Annotate a VCF with allele TYPE and NGB_* depth/fraction FORMAT fields.

    For every record the first sample's ``AD`` values are used to derive the
    total depth (``NGB_DP``), reference count (``NGB_RO``), per-ALT counts
    (``NGB_AO``) and per-ALT fractions (``NGB_VAF``). ``TYPE`` is taken from
    ``ngb_functions.pairdiff(ref, alt)['variant_type']`` for each ALT.

    Args:
        opts: Options object; ``opts.output`` is the output path, and a falsy
            value writes to standard output (``'-'``).
        inputvcf: Path of the VCF to read.
    """
    outputvcf = opts.output

    with VariantFile(inputvcf) as vcf_in:
        # Add INFO to Header
        vcf_in.header.info.add(
            "TYPE", "A", "String",
            "The type of allele, either snp, ins, del, or complex.")

        # Add FORMAT to Header
        vcf_in.header.formats.add(
            "NGB_DP", "1", "Integer",
            "Approximate read depth; some reads may have been filtered")
        vcf_in.header.formats.add("NGB_AO", "A", "Integer",
                                  "Alternate allele observation count")
        vcf_in.header.formats.add("NGB_RO", "1", "Integer",
                                  "Reference allele observation count")
        vcf_in.header.formats.add(
            "NGB_VAF", "A", "Float",
            "Allele fractions of alternate alleles in the tumor")

        # Context managers make sure the output is flushed and closed.
        with VariantFile(outputvcf if outputvcf else '-', 'w',
                         header=vcf_in.header) as vcf_out:
            for record in vcf_in.fetch():
                ref = record.ref
                # A record without ALT alleles has alts == None; treat it as
                # "nothing to annotate" instead of failing in enumerate().
                alts = record.alts or ()
                sample = record.samples[0]
                allele_depths = sample['AD']
                tmp_dp = sum(allele_depths)
                tmp_ro = allele_depths[0]

                variant_type_list = []
                ngb_ao_list = []
                ngb_vaf_list = []
                for n, alt in enumerate(alts):
                    # Get Variant TYPE (freebayes format)
                    ret = ngb_functions.pairdiff(ref, alt)
                    variant_type_list.append(ret['variant_type'])

                    # Get AO and VAF; a zero total depth would divide by
                    # zero, so report a fraction of 0.0 in that case.
                    alt_depth = allele_depths[n + 1]
                    if tmp_dp:
                        tmp_vaf = float(alt_depth) / float(tmp_dp)
                    else:
                        tmp_vaf = 0.0
                    ngb_ao_list.append(int(alt_depth))
                    ngb_vaf_list.append(tmp_vaf)

                # Only records with at least one ALT receive annotations.
                if variant_type_list:
                    record.info['TYPE'] = variant_type_list
                    sample["NGB_DP"] = tmp_dp
                    sample["NGB_AO"] = tuple(ngb_ao_list)
                    sample["NGB_RO"] = tmp_ro
                    sample["NGB_VAF"] = tuple(ngb_vaf_list)

                # Write VCF
                vcf_out.write(record)
def test_read_variant_vcf(self):
    """Exercise ``read_variant`` on VCF input across several call modes.

    The reader object is stateful: consecutive ``read_variant`` calls on the
    same ``infile`` consume successive records, so the order of the sections
    below matters.
    """
    p = pd.read_csv(P, index_col=0, sep='\t')['binary']
    infile = VariantFile(VCF)
    # first variant, all samples of the phenotype table provided
    t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertEqual(abs((k - np.zeros(50)).max()), 0.0)
    self.assertEqual(var_name, 'FM211187_16_G_A')
    self.assertEqual(kstrains, [])
    self.assertEqual(nkstrains,
                     sorted(['sample_%d' % x for x in range(1, 51)]))
    self.assertEqual(af, 0.0)
    self.assertEqual(missing, 0.0)
    # not providing samples
    t = read_variant(infile, p, 'vcf', False, [], False, set(), [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertEqual(k, None)
    self.assertEqual(var_name, None)
    self.assertEqual(kstrains, None)
    self.assertEqual(nkstrains, None)
    self.assertEqual(af, None)
    # providing burden
    burden_regions = deque([])
    load_burden(B, burden_regions)
    t = read_variant(infile, p.head(5), 'vcf', True, burden_regions, False,
                     p.head(5).index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertTrue(abs((k - np.array([0, 0, 0, 0, 0])).max()) < 1E-7)
    self.assertEqual(var_name, 'CDS1')
    self.assertEqual(kstrains, [])
    self.assertEqual(nkstrains, ['sample_1', 'sample_2', 'sample_3',
                                 'sample_4', 'sample_5'])
    self.assertEqual(af, 0.0)
    self.assertEqual(missing, 0)
    # providing burden
    burden_regions = deque([])
    load_burden(BM, burden_regions)
    # last one has multiple regions
    burden_regions.reverse()
    t = read_variant(infile, p.head(5), 'vcf', True, burden_regions, False,
                     p.head(5).index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertTrue(abs((k - np.array([0, 0, 0, 0, 0])).max()) < 1E-7)
    self.assertEqual(var_name, 'CDS3')
    self.assertEqual(kstrains, [])
    self.assertEqual(nkstrains, ['sample_1', 'sample_2', 'sample_3',
                                 'sample_4', 'sample_5'])
    self.assertEqual(af, 0.0)
    self.assertEqual(missing, 0)
    # uncompressed option - no effect
    # (a fresh reader is opened, so the first variant is read again)
    infile = VariantFile(VCF)
    t = read_variant(infile, p.head(5), 'vcf', False, [], True,
                     p.head(5).index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertTrue(abs((k - np.array([0, 0, 0, 0, 0])).max()) < 1E-7)
    self.assertEqual(var_name, 'FM211187_16_G_A')
    self.assertEqual(kstrains, [])
    self.assertEqual(nkstrains, ['sample_1', 'sample_2', 'sample_3',
                                 'sample_4', 'sample_5'])
    self.assertEqual(af, 0.0)
    self.assertEqual(missing, 0.0)
    # different type
    # (a VariantFile handed over as 'kmers' or 'Rtab' input must fail)
    with self.assertRaises(AttributeError):
        t = read_variant(infile, p.head(5), 'kmers', False, [], True,
                         p.head(5).index, [])
    with self.assertRaises(AttributeError):
        t = read_variant(infile, p.head(5), 'Rtab', False, [], False,
                         p.head(5).index, [])
    # read until exhaustion
    # (t still holds the last successful result, whose eof flag is False)
    while not t[0]:
        t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, True)
    self.assertEqual(k, None)
    self.assertEqual(var_name, None)
    self.assertEqual(kstrains, None)
    self.assertEqual(nkstrains, None)
    self.assertEqual(af, None)
    self.assertEqual(missing, None)
    # different file
    # (plain/gzip file objects are not VariantFile readers)
    infile = gzip.open(KMER)
    with self.assertRaises(AttributeError):
        t = read_variant(infile, p.head(5), 'vcf', False, [], False,
                         p.head(5).index, [])
    infile = open(PRES)
    with self.assertRaises(AttributeError):
        t = read_variant(infile, p.head(5), 'vcf', False, [], False,
                         p.head(5).index, [])
    # burden with missing genotypes in last read variant
    # issue #90
    p = pd.read_csv(P, index_col=0, sep='\t')['binary']
    infile = VariantFile(VCFMISSING)
    burden_regions = deque([])
    load_burden(BMISSING, burden_regions)
    t = read_variant(infile, p.head(5), 'vcf', True, burden_regions, False,
                     p.head(5).index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(eof, False)
    self.assertTrue(abs((k - np.array([1, 1, 0, 0, 0])).max()) < 1E-7)
    self.assertEqual(var_name, 'CDS1')
    self.assertEqual(kstrains, ['sample_1', 'sample_2'])
    self.assertEqual(nkstrains, ['sample_3', 'sample_4', 'sample_5'])
    self.assertEqual(af, 0.4)
    self.assertEqual(missing, 0)
    # check that missing variants are properly missed
    # issue #120
    p = pd.read_csv(P, index_col=0, sep='\t')['binary']
    infile = VariantFile(VCFMISSING)
    variant = next(infile)
    # Compute the expected missing fraction directly with pysam: the share
    # of haplotype calls that are None or '.', over samples present in p.
    total = 0
    missing = 0
    samples = set()
    for sample, call in variant.samples.items():
        if sample not in p.index:
            continue
        for haplotype in call.get('GT', [None]):
            if haplotype is None or haplotype == '.':
                missing += 1
            total += 1
        samples.add(sample)
    pysam_missing = missing / float(total)
    # Re-open the file so read_variant sees the same first record.
    infile = VariantFile(VCFMISSING)
    t = read_variant(infile, p, 'vcf', False, [], False, p.index, [])
    eof, k, var_name, kstrains, nkstrains, af, missing = t
    self.assertEqual(pysam_missing, missing)
def main(self, args):
    """Convert one contig of a VCF into the SMC++ text format.

    Validates the population/sample arguments, determines the distinguished
    and undistinguished lineages, writes a JSON header line to ``args.out``
    and then emits run-length rows built from the biallelic single-base
    records of ``args.contig``, interleaved with optional mask intervals.
    Exits the process with status 1 on unrecoverable input problems.
    """
    command.Command.main(self, args)
    self.validate(args)
    # A population given as a single "@path" entry is replaced by the list
    # of sample ids read from that file, one per line.
    for i in [1, 2]:
        attr = "pop%d" % i
        pid, ary = getattr(args, attr)
        if len(ary) == 1 and ary[0][0] == "@":
            setattr(args, attr, SampleList(
                pid, open(ary[0][1:], "rt").read().strip().split("\n")))
    pop_d = dict([args.pop1, args.pop2])
    # Reject populations that list the same sample more than once.
    for pid in pop_d:
        if pop_d[pid]:
            c = Counter(pop_d[pid])
            if max(c.values()) > 1:
                raise RuntimeError(
                    "Population %s has duplicated samples: %s" %
                    (pid, [item for item in c.items() if item[1] > 1]))
    # dist[j] collects the (sample id, haplotype index) pairs that are
    # distinguished lineages of population j + 1.
    dist = [[], []]
    if not args.d:
        # Default: both distinguished lineages come from the first sample
        # of population 1.
        first_sid = args.pop1.samples[0]
        args.d = [first_sid] * 2
    args.d = [args.d[0] + ":0", args.d[1] + ":1"]
    all_samples = set(args.pop1.samples) | set(args.pop2.samples)
    for sid_i in args.d:
        sid, i = sid_i.split(":")
        i = int(i)
        if sid not in all_samples:
            raise RuntimeError("%s is not in the sample list" % sid)
        if sid in args.pop1.samples:
            d = dist[0]
        else:
            assert sid in args.pop2.samples
            d = dist[1]
        d.append((sid, i))
    # Every remaining (sample, haplotype) pair is an undistinguished lineage.
    undist = [[(k, i) for k in p.samples for i in (0, 1) if (k, i) not in d]
              for p, d in zip((args.pop1, args.pop2), dist)]
    npop = 1

    def print_pop(i):
        # Log the lineages of population i (1-based).
        logger.info("Population %d:" % i)
        logger.info("Distinguished lineages: " +
                    ", ".join("%s:%d" % t for t in dist[i - 1]))
        logger.info("Undistinguished lineages: " +
                    ", ".join("%s:%d" % t for t in undist[i - 1]))

    print_pop(1)
    if args.pop2.pid is not None:
        npop = 2
        common = set(args.pop1.samples) & set(args.pop2.samples)
        if common:
            logger.error("Populations 1 and 2 should be disjoint, "
                         "but both contain " + ", ".join(common))
            sys.exit(1)
        print_pop(2)

    # Start parsing
    vcf = VariantFile(args.vcf)
    with optional_gzip(args.out, "wt") as out:
        samples = list(vcf.header.samples)
        dist = dist[:npop]
        undist = undist[:npop]
        if not set([dd[0] for d in dist for dd in d]) <= set(samples):
            raise RuntimeError("Distinguished lineages not found in data?")
        missing = [s for u in undist for s, _ in u if s not in samples]
        if missing:
            msg = "The following samples were not found in the data: %s. " % ", ".join(
                missing)
            if args.ignore_missing:
                logger.warn(msg)
            else:
                msg += "If you want to continue without these samples, use --ignore-missing."
                raise RuntimeError(msg)
        # Drop lineages whose sample is absent from the VCF.
        undist = [[t for t in u if t[0] not in missing] for u in undist]

        # Write header
        pids = [a.pid for a in (args.pop1, args.pop2)[:npop]]
        out.write("# SMC++ ")
        json.dump({"version": version, "pids": pids,
                   "undist": undist, "dist": dist}, out)
        out.write("\n")
        # Number of distinguished / undistinguished lineages per population.
        na = list(map(len, dist))
        nb = list(map(len, undist))

        # function to convert a VCF record to our format:
        # <span, dist gt, # undist gt, # undist, [...]>
        def rec2gt(rec):
            ref = rec.alleles[0]
            da = [[rec.samples[d].alleles[i] for d, i in di] for di in dist]
            # Count of non-reference distinguished alleles, or -1 when any
            # of them is missing.
            a = [sum([x != ref for x in d])
                 if None not in d else -1 for d in da]
            # Undistinguished lineages with a missing allele are left out.
            bs = [[rec.samples[d].alleles[i] != ref
                   for d, i in un if rec.samples[d].alleles[i] is not None]
                  for un in undist]
            b = [sum(_) for _ in bs]
            nb = [len(_) for _ in bs]  # local; shadows the outer nb
            # Fold non-polymorphic (in subsample) sites
            if np.array_equal(b, nb) and np.array_equal(a, na):
                a = [0] * len(a)
                b = [0] * len(b)
            return list(sum(zip(a, b, nb), tuple()))

        try:
            region_iterator = vcf.fetch(contig=args.contig)
        except ValueError as e:
            logger.error("VCF reader threw an error: %s", e)
            logger.error("Make sure the VCF is indexed:")
            logger.error("")
            logger.error(" $ tabix %s", args.vcf)
            logger.error("")
            sys.exit(1)
        # An explicit --length wins over the length stored in the header.
        contig_length = args.length or vcf.header.contigs[args.contig].length
        if contig_length is None:
            logger.error("Failed to acquire contig length from VCF header. See the --length option.")
            sys.exit(1)
        if args.mask:
            mask_iterator = TabixFile(
                args.mask).fetch(reference=args.contig)
            # With a mask file the missing cutoff is disabled.
            args.missing_cutoff = np.inf
        else:
            mask_iterator = iter([])
        if args.missing_cutoff is None:
            args.missing_cutoff = np.inf
        # Parse mask lines into (contig, start, end) with integer bounds.
        mask_iterator = (x.split("\t") for x in mask_iterator)
        mask_iterator = ((x[0], int(x[1]), int(x[2]))
                         for x in mask_iterator)
        # Keep records with at most two alleles, each one base long.
        snps_only = (
            rec for rec in region_iterator
            if len(rec.alleles) <= 2 and all(len(a) == 1 for a in rec.alleles)
        )

        def interleaved():
            # Merge SNP records and mask intervals by position, yielding
            # ("snp", record) or ("mask", interval); SNPs falling inside a
            # mask interval are consumed without being yielded.
            cmask = next(mask_iterator, None)
            csnp = next(snps_only, None)
            while cmask or csnp:
                if cmask is None:
                    yield "snp", csnp
                    csnp = next(snps_only, None)
                elif csnp is None:
                    yield "mask", cmask
                    cmask = next(mask_iterator, None)
                else:
                    if csnp.pos < cmask[1]:
                        yield "snp", csnp
                        csnp = next(snps_only, None)
                    elif csnp.pos <= cmask[2]:
                        while csnp is not None and csnp.pos <= cmask[2]:
                            csnp = next(snps_only, None)
                        yield "mask", cmask
                        cmask = next(mask_iterator, None)
                    else:
                        yield "mask", cmask
                        cmask = next(mask_iterator, None)

        # Row payloads for a missing span and for a non-segregating span.
        abnb_miss = [-1, 0, 0] * len(nb)
        abnb_nonseg = sum([[0, 0, x] for x in nb], [])
        multiples = set()
        with RepeatingWriter(out) as rw, \
                tqdm.tqdm(total=contig_length, unit='bases', unit_scale=True) as bar:
            def write(x):
                # Skip the very first row when --drop-first-last is set.
                if not write.first or not args.drop_first_last:
                    rw.write(x)
                write.first = False

            write.first = True
            last_pos = 0
            for ty, rec in interleaved():
                if ty == "mask":
                    # Non-segregating stretch up to the mask, then the
                    # masked interval itself as missing.
                    span = rec[1] - last_pos
                    write([span] + abnb_nonseg)
                    write([rec[2] - rec[1] + 1] + abnb_miss)
                    last_pos = rec[2]
                    continue
                bar.update(rec.pos - last_pos)
                abnb = rec2gt(rec)
                if rec.pos == last_pos:
                    # Second record at the same position: remember and skip.
                    multiples.add(rec.pos)
                    continue
                span = rec.pos - last_pos - 1
                if 1 <= span <= args.missing_cutoff:
                    write([span] + abnb_nonseg)
                elif span > args.missing_cutoff:
                    write([span] + abnb_miss)
                write([1] + abnb)
                last_pos = rec.pos
            if not args.drop_first_last:
                # Pad the tail of the contig as non-segregating.
                write([contig_length - last_pos] + abnb_nonseg)
    if multiples:
        # FIXME: what to do with multiple records at same site
        logger.warn(
            "Multiple entries found at %d positions; skipped all but the first",
            len(multiples))
import matplotlib
# Select the non-interactive backend before pyplot is imported.
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from pysam import VariantFile

# Plot a histogram of the QUAL values of the input VCF. The input and
# output paths come from the ``everclear`` object, which is not defined in
# this script and must be injected by whatever runs it.
with VariantFile(everclear.input[0]) as variants:
    # Records with a missing QUAL ('.') report None, which cannot be binned.
    quals = [
        record.qual for record in variants if record.qual is not None
    ]
plt.hist(quals)
plt.savefig(everclear.output[0])
from labels import SVRecord_generic from collections import defaultdict import brewer2mpl import matplotlib.patches as patches windowsizes = [] windowsizes_by_caller = {} windowsizes_by_SVType = defaultdict(list) SVCount_bytype = defaultdict(int) callers = [] lost_SVs = 0 total_SVs = 0 for vcf_file in os.listdir('../MinorResearchInternship/VCF'): vcf_in = VariantFile('../MinorResearchInternship/VCF/' + vcf_file, 'r') caller = re.findall(r'^\w*', vcf_file) callers += [caller[0]] windowsizes_by_caller[caller[0]] = { "CI_sizes": { "Start": { "DEL": [], "INS": [], "BND": [], "INV": [], "DUP": [] }, "End": { "DEL": [], "INS": [], "BND": [],
def Main():
    """Match interaction calls against VCF genotypes and report agreeing SNPs.

    For every interaction line, the variants inside the interactor region are
    fetched from the VCF. A variant is reported when its per-sample genotype
    pattern (reordered with ``sample_list``) equals the interaction pattern
    in all six samples. Relies on the module-level names ``int2binary`` and
    ``pattern``.
    """
    parser = argparse.ArgumentParser(
        description="loading vcf and interaction files")
    parser.add_argument("interactionfile",
                        help="Interaction calls from HiCap method")
    parser.add_argument(
        "vcfile", help="Variant calls from either HiCap or sequencing samples")
    parser.add_argument("-o", "--output", help="output of interaction files",
                        action='store', default=None)
    args = parser.parse_args()
    if args.output is None:
        # open(None) would fail with an opaque TypeError further down.
        parser.error("an output file is required (-o/--output)")

    Vcfin = VariantFile(args.vcfile)
    result_title = [
        "RefSeqName", "TranscriptName", "Feature_ID", "Feature_Chr",
        "Feature_Start", "Feature_End", "Annotation", "Strand",
        "Interactor_Chr", "Interactor_Start", "Interactor_End", "Distance",
        "SNPs", "SNP_ID", "Ind_count", "Swed_Freq", "TAV2431", "TAV2515",
        "TAV2709", "BAV2375", "BAV2424", "BAV2714"
    ]
    # Sort keys used to reorder the VCF genotypes before they are compared
    # with the interaction columns.
    sample_list = [3, 4, 5, 0, 1, 2]

    # The output stays open for the whole run instead of being reopened in
    # append mode for every reported SNP.
    with open(args.output, "w") as output_file:
        output_file.write("\t".join(result_title) + "\n")
        with open(args.interactionfile, 'r') as interactions:
            next(interactions, None)  # skip the header; tolerate empty file
            for line in interactions:
                line = line.strip().split("\t")
                all_fields = tuple(line[:12])
                # Interactor region; the first three characters of the
                # chromosome field (e.g. "chr") are dropped for the lookup.
                region = ((line[8])[3:], line[9], line[10])
                interaction_sample = [
                    [line[12], line[13]],  # TAV2431
                    [line[15], line[16]],  # TAV2515
                    [line[18], line[19]],  # TAV2709
                    [line[21], line[22]],  # BAV2375
                    [line[24], line[25]],  # BAV2424
                    [line[27], line[28]],  # BAV2714
                ]
                interaction_binary = int2binary(interaction_sample)

                for rec in Vcfin.fetch(region[0], int(region[1]),
                                       int(region[2])):
                    # "1" for 0/1 and 1/1, "0" for 0/0; any other genotype
                    # contributes nothing.
                    genotype_binary = []
                    for call in rec.samples.values():
                        genotype = "/".join([str(x) for x in call["GT"]])
                        if genotype == "None/None":
                            continue
                        elif genotype == "0/1" or genotype == "1/1":
                            genotype_binary.append("1")
                        elif genotype == "0/0":
                            genotype_binary.append("0")

                    # Last INFO value whose key matches `pattern`, else "0".
                    swed_freq = "0"
                    for key, value in rec.info.iteritems():
                        if pattern.match(key):
                            swed_freq = value
                    if rec.id is None:
                        rec.id = "X"

                    sorted_genotype = [
                        x for _, x in sorted(zip(sample_list, genotype_binary))
                    ]
                    zip_array = list(zip(interaction_binary, sorted_genotype))
                    count = sum(1 for a, b in zip_array if a == b)
                    if count != 6:
                        continue

                    allele = "|".join(rec.alleles)
                    count_int_allele = sum(
                        1 for pair in zip_array if pair == ('1', '1'))
                    changed_freq = "".join(str(x) for x in swed_freq)
                    unzip_array = ["|".join(x) for x in zip_array]
                    snp = (line[8], rec.start, rec.stop, allele,
                           rec.filter.keys()[0])
                    str_snp = "_".join(str(x) for x in snp)
                    result = ("\t".join(all_fields), str_snp, rec.id,
                              count_int_allele, changed_freq,
                              "\t".join(unzip_array))
                    combined_result = "\t".join(str(x) for x in result)
                    output_file.write(combined_result + "\n")
def match_replicates(args):
    """Match a genome against another presumably identical genome (i.e. replicates).

    Reads ``args.vcf1`` and ``args.vcf2`` against ``args.reference``, groups
    overlapping loci of both inputs into superloci, prints a textual report
    for each superlocus and records the match status in the outputs created
    by ``make_outputs`` for ``args.out1`` and ``args.out2``.
    """
    refs = Fastafile(expanduser(args.reference))
    in_vars = [VariantFile(var) for var in [args.vcf1, args.vcf2]]
    out_vars = make_outputs(in_vars, args.out1, args.out2)
    # Symbols reported for a match, a mismatch and an undetermined result.
    match_status_map = {True: '=', False: 'X', None: '.'}

    # Create parallel locus iterator by chromosome
    for chrom, ref, loci in records_by_chromosome(refs, in_vars, [args.name1, args.name2], args):
        # Create superloci by taking the union of overlapping loci across all of the locus streams
        loci = [
            sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key)
            for l in loci
        ]
        superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))

        # Proceed by superlocus
        for _, _, (super1, super2) in superloci:
            super1.sort(key=NormalizedLocus.natural_order_key)
            super2.sort(key=NormalizedLocus.natural_order_key)

            super_start, super_stop = get_superlocus_bounds([super1, super2])

            print('-' * 80)
            print(f'{chrom}:[{super_start:d}-{super_stop:d}):')
            print()

            # Normalized view of every locus: interval, reference allele and
            # genotype; '-' stands for an empty allele, '.' for a missing one.
            for i, superlocus in enumerate([super1, super2], 1):
                for locus in superlocus:
                    lstart = locus.start
                    lstop = locus.stop
                    lref = locus.ref or '-'
                    indices = locus.allele_indices
                    sep = '|' if locus.phased else '/'
                    geno = sep.join(
                        locus.alleles[a] or '-' if a is not None else '.'
                        for a in indices)
                    print(
                        f' NORM{i:d}: [{lstart:5d}-{lstop:5d}) ref={lref} geno={geno}'
                    )
            print()

            match, match_type = superlocus_equal(ref, super_start, super_stop, super1, super2, debug=args.debug)
            match_status = match_status_map[match]

            print(f' MATCH={match_status} TYPE={match_type}')
            print()

            write_match(out_vars[0], super1, args.name1, match_status, match_type)
            write_match(out_vars[1], super2, args.name2, match_status, match_type)

            # Original VCF records behind each locus (printed without an
            # added newline).
            for i, superlocus in enumerate([super1, super2], 1):
                for locus in superlocus:
                    print(f' VCF{i:d}: {locus.record}', end='')
            print()

    # Entries of out_vars may be None, so only real outputs are closed.
    for out_var in out_vars:
        if out_var is not None:
            out_var.close()
import pysam
import sys
from pysam import VariantFile

# Read variants from standard input.
bcf_in = VariantFile('-')  # auto-detect input format

# Disabled debugging code, kept as an inert string literal: it printed the
# attributes of every FORMAT definition in the input header.
'''
print('\naaa\n')
print(dir(bcf_in.header))
for k, v in bcf_in.header.formats.items():
    print('{}\t{}'.format(k, v))
    print('\t{}'.format(dir(v)))
    print('\t{}'.format(v.name))
    print('\t{}'.format(v.number))
    print('\t{}'.format(v.type))
    print('\t{}'.format(v.record))
    print('\t{}'.format(v.id))
    print('\t{}'.format(v.description))
print('\nbbb\n')
print(bcf_in.header.formats)
'''

# Declare the new per-sample NonHomrefQ field before the output is created,
# so the output header carries the definition.
bcf_in.header.add_line(
    '##FORMAT=<ID=NonHomrefQ,Number=1,Type=Integer,Description=\"Likelihood of the homozygous-reference genotype\">'
)
# Write to standard output using the extended input header.
bcf_out = VariantFile('-', 'w', header=bcf_in.header)

# Only the first sample of the input is inspected.
sample = bcf_in.header.samples[0]
for rec in bcf_in:
    # The first GL4 value of the sample must be present.
    # NOTE(review): nothing visible here writes to bcf_out; the loop body
    # presumably continues beyond this snippet - confirm.
    assert rec.samples[sample]['GL4'][
        0] != None, 'The record {} is invalid!'.format(rec)
def setup(cls, source):
    """Build an instance of *cls* for *source* with its VariantFile opened.

    The new object is created with ``cls(source)``; a ``VariantFile`` opened
    on the object's own ``source`` attribute is stored as ``f``.

    Returns:
        The prepared instance.
    """
    instance = cls(source)
    reader = VariantFile(instance.source)
    instance.f = reader
    return instance
def add_freqs():
    """Print a VCF with low-support alleles removed and INFO/AF added.

    Command-line arguments (read from ``sys.argv``):
        argv[2]: path of the input VCF.
        argv[3]: path of a ``.fai`` index; its first line supplies the contig
            name and length written to the header and to every record.
        argv[4]: optional minimum number of samples an allele needs
            (used only when exactly five argv entries are present, else 0).

    The header and the kept records go to standard output; progress and the
    final counters go to standard error. Only the first allele index of each
    sample's genotype is considered.
    """
    vcf_path = sys.argv[2]
    fai_path = sys.argv[3]
    min_samples = int(sys.argv[4]) if len(sys.argv) == 5 else 0

    vcf = VariantFile(vcf_path, 'r', drop_samples=False)
    # Read just the first index line and close the file right away.
    with open(fai_path) as fai:
        ref_name, ref_len = fai.readline().strip('\n').split('\t')[0:2]
    new_contig = f"##contig=<ID={ref_name},length={ref_len}>"
    vcf.header.add_line(new_contig)
    vcf.header.add_line(
        "##INFO=<ID=AF,Number=A,Type=Float,Description=\"Estimated allele frequency in the range (0,1)\">"
    )
    # str(header) ends with a newline; drop the resulting empty last item.
    print('\n'.join(str(vcf.header).split('\n')[:-1]))

    removed_variants = 0
    removed_some_alt = 0
    tot_removed_alleles = 0
    tot_removed_genotypes = 0
    next_milestone = 1000
    contig_end = int(ref_len) - 50

    for record in vcf:
        # Ignore variants within 50 bases of either contig end.
        if record.pos <= 50 or record.pos >= contig_end:
            continue
        if record.pos > next_milestone:
            print("Reached position", record.pos, file=sys.stderr)
            next_milestone += 1000

        # Count samples per allele index and remember the carriers of
        # alleles that are still below the threshold.
        n_gts = defaultdict(int)
        low_samples = defaultdict(list)
        for sample in record.samples.itervalues():
            curr_gt = sample.allele_indices[0]
            n_gts[curr_gt] += 1
            if curr_gt != 0 and n_gts[curr_gt] < min_samples:
                low_samples[curr_gt].append(sample)

        to_delete = {gt for gt in n_gts if n_gts[gt] < min_samples}
        if to_delete:
            if len(n_gts) - len(to_delete) == 1:
                # Only one allele would survive: drop the whole variant.
                removed_variants += 1
                continue
            tot_removed_alleles += len(to_delete)
            removed_some_alt += 1
            for gt in to_delete:
                # Reassign the carriers of a removed allele to the reference.
                for sample in low_samples[gt]:
                    sample.allele_indices = (0, )
                n_gts[0] += n_gts[gt]
                tot_removed_genotypes += n_gts[gt]
                n_gts[gt] = 0

        tot_samples = sum(n_gts.values())
        freqs = {
            gt: round(count / tot_samples, 6)
            for gt, count in n_gts.items()
        }
        # AF is declared Number=A: exactly one value per ALT allele, in ALT
        # order. Indexing by allele number (instead of the order in which
        # genotypes happened to be seen) keeps values aligned with ALT and
        # yields 0.0 for alleles no sample carries.
        alt_freqs = [
            freqs.get(index, 0.0) for index in range(1, len(record.alleles))
        ]

        record.chrom = ref_name
        record.info["AF"] = alt_freqs
        print(record, end='', flush=False)

    print("removed_variants=", removed_variants, file=sys.stderr)
    print("removed_some_alt=", removed_some_alt, file=sys.stderr)
    print("tot_removed_alleles=", tot_removed_alleles, file=sys.stderr)
    print("tot_removed_genotypes=", tot_removed_genotypes, file=sys.stderr)
def force_calling(bam_path, ivcf_path, output_path, sigs_dir,
                  max_cluster_bias_dict, threshold_gloab_dict, gt_round,
                  threads):
    """Genotype the structural variants listed in a VCF (force calling).

    For every DEL/INS/DUP/INV/TRA record of ``ivcf_path`` the supporting
    signature reads are looked up and a genotyping task is submitted to a
    worker pool running ``call_gt_wrapper``.

    Args:
        bam_path: Alignment file path forwarded to the workers.
        ivcf_path: VCF with the target structural variants.
        output_path: Not used by this function.
        sigs_dir: Directory handed to the ``parse_*sigs`` helpers.
        max_cluster_bias_dict: Per-SV-type cluster bias, indexed by SV type.
        threshold_gloab_dict: Per-SV-type threshold, used for INS and DEL.
        gt_round: Forwarded unchanged to the workers.
        threads: Number of worker processes.

    Returns:
        A list with the first element of each worker result, in submission
        order. Tasks whose worker raised are logged and left out.
    """
    logging.info('Check the parameter -Ivcf: OK.')
    logging.info('Enable to perform force calling.')

    sv_dict = dict()
    for sv_type in ["DEL", "DUP"]:
        sv_dict[sv_type] = parse_sigs(sv_type, sigs_dir)
    sv_dict['INS'] = parse_inssigs(sigs_dir)
    sv_dict['INV'] = parse_invsigs(sigs_dir)
    sv_dict['TRA'] = parse_trasigs(sigs_dir)

    # First pass: the total number of records is passed to every task.
    with VariantFile(ivcf_path, 'r') as vcf_reader:
        row_count = sum(1 for _ in vcf_reader.fetch())

    gt_list = list()
    process_pool = Pool(processes=threads)
    with VariantFile(ivcf_path, 'r') as vcf_reader:
        # idx counts every record, including those skipped below.
        for idx, record in enumerate(vcf_reader.fetch()):
            sv_type, chrom, sv_chr2, pos, sv_end, sv_strand = parse_record(
                record)
            if sv_type not in ["DEL", "INS", "DUP", "INV", "TRA"]:
                continue

            # Select the candidate signatures for this SV.
            search_id_list = []
            if sv_type == 'TRA' and 'TRA' in sv_dict and chrom in sv_dict[
                    'TRA'] and sv_chr2 in sv_dict['TRA'][chrom]:
                search_id_list = sv_dict['TRA'][chrom][sv_chr2]
            elif sv_type == 'INV' and 'INV' in sv_dict and chrom in sv_dict[
                    'INV']:
                if sv_strand in sv_dict['INV'][chrom]:
                    search_id_list = sv_dict['INV'][chrom][sv_strand]
                else:
                    # Unknown strand: fall back to the first strand present.
                    for strand_iter in sv_dict['INV'][chrom]:
                        sv_strand = strand_iter
                        search_id_list = sv_dict['INV'][chrom][strand_iter]
                        break
            elif sv_type != 'TRA' and sv_type != 'INV' and \
                    sv_type in sv_dict and chrom in sv_dict[sv_type]:
                search_id_list = sv_dict[sv_type][chrom]

            max_cluster_bias = 0
            if sv_type == 'INS' or sv_type == 'DEL':
                (read_id_list, max_cluster_bias, indel_seq, CIPOS,
                 CILEN) = find_in_indel_list(
                     sv_type, search_id_list, max_cluster_bias_dict[sv_type],
                     pos, sv_end, threshold_gloab_dict[sv_type])
            else:
                read_id_list, max_cluster_bias = find_in_list(
                    sv_type, search_id_list, max_cluster_bias_dict[sv_type],
                    pos, sv_end)
                CIPOS = '.,.'
                CILEN = '.,.'

            # Inversion without support on the chosen strand: try the other
            # strands and keep the first one that yields reads.
            if sv_type == 'INV' and 'INV' in sv_dict and chrom in sv_dict[
                    'INV'] and len(read_id_list) == 0:
                for strand_iter in sv_dict['INV'][chrom]:
                    if strand_iter != sv_strand:
                        search_id_list = sv_dict['INV'][chrom][strand_iter]
                        read_id_list, max_cluster_bias = find_in_list(
                            sv_type, search_id_list,
                            max_cluster_bias_dict[sv_type], pos, sv_end)
                        if len(read_id_list) != 0:
                            sv_strand = strand_iter
                            break

            if sv_type == 'INS':
                max_cluster_bias = max(1000, max_cluster_bias)
            else:
                max_cluster_bias = max(max_cluster_bias_dict[sv_type],
                                       max_cluster_bias)
            para = Para(record, CIPOS, CILEN)

            # Positional arguments differ per SV family.
            if sv_type in ('INS', 'DEL'):
                call_args = [
                    bam_path, pos, chrom, read_id_list, max_cluster_bias,
                    gt_round
                ]
            elif sv_type in ('INV', 'DUP'):
                call_args = [
                    bam_path, pos, sv_end, chrom, read_id_list,
                    max_cluster_bias, gt_round
                ]
            else:  # TRA
                call_args = [
                    bam_path, pos, sv_end, chrom, sv_chr2, read_id_list,
                    max_cluster_bias, gt_round
                ]
            # Insertions carry their sequence; other types a symbolic allele.
            if sv_type == 'INS':
                alt_allele = indel_seq
            else:
                alt_allele = '<' + sv_type + '>'
            fx_para = [(call_args, idx, row_count, para, sv_strand,
                        alt_allele, sv_type)]
            gt_list.append(process_pool.map_async(call_gt_wrapper, fx_para))

        # Wait for the workers before the reader is closed.
        process_pool.close()
        process_pool.join()

    semi_result = list()
    for item in gt_list:
        try:
            semi_result.append(item.get()[0])
        except Exception:
            # Keep going without this record, but leave a trace instead of
            # silently dropping it.
            logging.exception('Force calling failed for one record; skipped.')
    logging.info('Finished force calling.')
    return semi_result
"""Extract reference (FASTA) and sample names from the VCF file.""" import argparse import os from pysam import VariantFile from resolwe_runtime_utils import error, warning parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("vcf_file", help="VCF file (can be compressed using gzip/bgzip).") parser.add_argument("summary", help="Summary file to append to.") args = parser.parse_args() try: vcf = VariantFile(args.vcf_file) except (OSError, ValueError) as error_msg: proc_error = "Input VCF file does not exist or could not be correctly opened." print(error(proc_error)) raise ValueError(error_msg) vcf_header = vcf.header header_records = {record.key: record.value for record in vcf_header.records} with open(args.summary, "a") as out_file: try: fasta_name = os.path.basename(header_records["reference"]) except KeyError: fasta_name = "" print( warning(
def main(argv):
    """Command-line entry point for ``svtk standardize``.

    Builds the output header from a packaged template VCF (optionally with
    contigs taken from a ``.fai`` index), copies the sample names of the
    input VCF into it, tags the source algorithm, and writes every record
    produced by the matching ``VCFStandardizer`` to the output file.

    Parameters
    ----------
    argv : list of str
        Command-line arguments without the program name. When empty, the
        help text is printed and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtk standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source', help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-p', '--prefix', help='If provided, variant names '
                        'will be overwritten with this prefix.')
    parser.add_argument('--include-reference-sites', action='store_true',
                        default=False, help='Include records where all '
                        'samples are called 0/0 or ./.')
    parser.add_argument('--standardizer', help='Path to python file with '
                        'custom standardizer definition. (Not yet supported.)')
    parser.add_argument('--contigs', type=argparse.FileType('r'),
                        help='Reference fasta index (.fai). If provided, '
                        'contigs in index will be used in VCF header. '
                        'Otherwise all GRCh37 contigs will be used in header. '
                        'Variants on contigs not in provided list will be '
                        'removed.')
    parser.add_argument('--min-size', type=int, default=50,
                        help='Minimum SV size to report [50].')
    parser.add_argument('--call-null-sites', action='store_true',
                        default=False,
                        help='Call sites with null genotypes (./.). Generally '
                        'useful when an algorithm has been run on a single '
                        'sample and has only reported variant sites.')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    # Add contigs to header if provided
    if args.contigs:
        template = pkg_resources.resource_filename(
            'svtk', 'data/no_contigs_template.vcf')
        template = VariantFile(template)
        header = template.header
        contig_line = '##contig=<ID={contig},length={length}>'
        with args.contigs as contigs:
            for line in contigs:
                fields = line.split()
                # Blank or truncated index lines carry no contig definition.
                if len(fields) < 2:
                    continue
                header.add_line(
                    contig_line.format(contig=fields[0], length=fields[1]))
    # Use GRCh37 by default
    else:
        template = pkg_resources.resource_filename(
            'svtk', 'data/GRCh37_template.vcf')
        template = VariantFile(template)
        header = template.header

    with VariantFile(args.vcf) as vcf:
        # Template header includes all necessary FILTER, INFO, and FORMAT
        # fields. Just need to add samples from VCF being standardized.
        for sample in vcf.header.samples:
            header.add_sample(sample)

        # Tag source in header
        meta = ('##FORMAT=<ID={0},Number=1,Type=Integer,'
                'Description="Called by {1}"')
        meta = meta.format(args.source, args.source.capitalize())
        header.add_line(meta)
        header.add_line('##source={0}'.format(args.source))

        # Nested contexts close the output first and the input second, and
        # do so even if standardization raises part-way through.
        with VariantFile(args.fout, mode='w', header=header) as fout:
            standardizer = VCFStandardizer.create(
                args.source, vcf, fout, args.prefix, args.min_size,
                args.include_reference_sites, args.call_null_sites)
            for record in standardizer.standardize_vcf():
                fout.write(record)
def match_replicates(args):
    """Compare two replicate call sets superlocus by superlocus.

    For every chromosome the normalized loci of both inputs are merged into
    superloci (unions of overlapping loci). Each superlocus is printed, then
    compared with ``superlocus_equal``. When output paths are given, the
    records are written back with the ``BD`` (decision) and ``BK`` (sub-type)
    FORMAT fields set on the named sample.

    Parameters
    ----------
    args : namespace
        Reads ``reference``, ``vcf1``, ``vcf2``, ``out1``, ``out2``,
        ``name1``, ``name2`` and ``debug``; the whole object is also passed
        to ``records_by_chromosome``.
    """
    # Load FASTA reference
    refs = Fastafile(expanduser(args.reference))

    # Open input variant files
    in_vars = [VariantFile(var) for var in [args.vcf1, args.vcf2]]
    out_vars = [None, None]
    sample_names = [args.name1, args.name2]

    try:
        for idx, out_path in enumerate([args.out1, args.out2]):
            if not out_path:
                continue
            formats = in_vars[idx].header.formats
            formats.add('BD', '1', 'String',
                        'Match decision for call (match: =, mismatch: X, error: N)')
            formats.add('BK', '1', 'String',
                        'Sub-type for match decision (trivial: T, haplotype: H, error: N)')
            out_vars[idx] = VariantFile(out_path, 'w', header=in_vars[idx].header)

        match_status_map = {True: '=', False: 'X', None: '.'}

        # Create parallel locus iterator by chromosome
        for chrom, ref, loci in records_by_chromosome(refs, in_vars, sample_names, args):
            # Create superloci by taking the union of overlapping loci
            # across all of the locus streams
            loci = [sort_almost_sorted(l, key=NormalizedLocus.extreme_order_key)
                    for l in loci]
            superloci = union(loci, interval_func=attrgetter('min_start', 'max_stop'))

            # Proceed by superlocus
            for _, _, (super1, super2) in superloci:
                super1.sort(key=NormalizedLocus.natural_order_key)
                super2.sort(key=NormalizedLocus.natural_order_key)

                super_start, super_stop = get_superlocus_bounds([super1, super2])

                print('-' * 80)
                print('{}:[{:d}-{:d}):'.format(chrom, super_start, super_stop))
                print()

                for i, superlocus in enumerate([super1, super2], 1):
                    for locus in superlocus:
                        lstart = locus.start
                        lstop = locus.stop
                        lref = locus.alleles[0] or '-'
                        indices = locus.allele_indices
                        sep = '|' if locus.phased else '/'
                        # Empty alleles print as '-', missing calls as '.'.
                        geno = sep.join(locus.alleles[a] or '-' if a is not None else '.'
                                        for a in indices)
                        print(' NORM{:d}: [{:5d}-{:5d}) ref={} geno={}'.format(
                            i, lstart, lstop, lref, geno))
                print()

                match, match_type = superlocus_equal(
                    ref, super_start, super_stop, super1, super2, debug=args.debug)
                match_status = match_status_map[match]

                print(' MATCH={} TYPE={}'.format(match_status, match_type))
                print()

                # The hard work is done. The rest is just output and formatting...
                for out_var, superlocus, name in zip(out_vars, [super1, super2], sample_names):
                    if not out_var:
                        continue
                    for locus in sorted(superlocus, key=NormalizedLocus.record_order_key):
                        locus.record.samples[name]['BD'] = match_status
                        locus.record.samples[name]['BK'] = match_type
                        out_var.write(locus.record)

                for i, superlocus in enumerate([super1, super2], 1):
                    for locus in superlocus:
                        print(' VCF{:d}: {}'.format(i, locus.record), end='')
                print()
    finally:
        # Release every handle, including the inputs and the reference,
        # whether or not the comparison finished.
        for handle in out_vars + in_vars:
            if handle is not None:
                handle.close()
        refs.close()
#=========================================================================#
# Script: plot-quals.py                                                   #
#-------------------------------------------------------------------------#
# Generates a histogram of the quality scores based on the variant calls  #
# in calls/all.vcf.                                                       #
#=========================================================================#
import matplotlib
matplotlib.use("Agg")  # select the non-interactive backend before pyplot loads
import matplotlib.pyplot as plt
from pysam import VariantFile

# NOTE(review): `snakemake` is not defined in this script; presumably it is
# injected by Snakemake's `script:` directive. Confirm against the workflow.
with VariantFile(snakemake.input[0]) as vcf:
    # Records without a QUAL value report None, which cannot be binned.
    quals = [record.qual for record in vcf if record.qual is not None]

plt.hist(quals)
plt.savefig(snakemake.output[0])
def main():
    """Filter per-chromosome VCFs by R2 and MAF, relabel samples, and index.

    For every ``chrom -> vcf`` entry in ``args.chrom_vcf`` the header is
    copied with sample columns renamed according to the optional id map,
    records whose R2 and MAF INFO values both exceed the thresholds are
    written with their FILTER reset to PASS, and the result is compressed
    and indexed with ``pysam.tabix_index``. A two-column list of
    ``chrom<TAB>output.vcf.gz`` is written to ``<out_prefix>.vcf_list.tsv``.

    Raises
    ------
    ValueError
        If a chromosome name does not match ``(chr)?(.+)``.
    """
    args = process_input()

    chrom_vcf = args.chrom_vcf
    min_r2 = args.min_r2
    min_maf = args.min_maf
    out_prefix = args.out_prefix
    r2_field_name = args.r2_field_name
    maf_field_name = args.maf_field_name
    new_ids = args.new_ids

    ####
    # Read new ids in dictionary
    ####
    new_ids_dict = dict()
    if new_ids is not None:
        with open(new_ids, "r") as f:
            for line in f:
                line = line.rstrip()
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                if not line:
                    continue
                old_id, new_id = line.split("\t")
                new_ids_dict[old_id] = new_id
        print("Ids {0} ids to remap".format(len(new_ids_dict)))

    out_vcf_list = "{0}.vcf_list.tsv".format(out_prefix)

    with open(out_vcf_list, "w") as out_vcf_list_handle:
        for chrom, vcf in chrom_vcf.items():
            chrom_match = re.match("(chr)?(.+)", chrom)
            if chrom_match is not None:
                chrom = chrom_match.group(2)
            else:
                raise ValueError(
                    "Chomosome name {0} not formatted correctly!".format(chrom))

            out_vcf_name = "{0}.chr{1}.vcf".format(out_prefix, chrom)
            out_vcf_name_gz = "{0}.chr{1}.vcf.gz".format(out_prefix, chrom)
            out_vcf_name_gz_tbi = "{0}.chr{1}.vcf.gz.tbi".format(out_prefix, chrom)

            print("Processing chr{0} {1}...".format(chrom, vcf))

            in_vcf_handle = VariantFile(vcf)
            try:
                out_vcf_list_handle.write("{0}\t{1}".format(chrom, out_vcf_name_gz))
                out_vcf_list_handle.write("\n")

                # The output is written as plain text and compressed later:
                # per the original author, writing through pysam only worked
                # for BCF in the pysam version this was developed against.
                with open(out_vcf_name, "w") as out_vcf_handle:
                    print("Relabeling and writing header...")
                    relabeled_ids = 0
                    for line in str(in_vcf_handle.header).split("\n"):
                        if line == "":
                            continue
                        if re.match("^#CHROM.+", line):
                            cols = line.split("\t")
                            # Sample names start at the 10th column.
                            for i in range(9, len(cols)):
                                if cols[i] in new_ids_dict:
                                    relabeled_ids += 1
                                    cols[i] = new_ids_dict[cols[i]]
                            out_vcf_handle.write("\t".join(cols))
                        else:
                            out_vcf_handle.write(line)
                        out_vcf_handle.write("\n")

                    print("Relabeled {0} ids".format(relabeled_ids))

                    rec_count = 0
                    for rec in in_vcf_handle:
                        rec_count += 1
                        if rec_count % 50000 == 0:
                            print("Line: {0:d} {1}:{2:d}".format(
                                rec_count, rec.chrom, rec.pos))

                        r2 = rec.info[r2_field_name]
                        maf = rec.info[maf_field_name]

                        if r2 > min_r2 and maf > min_maf:
                            # Reset the filters so the record is PASS only.
                            rec.filter.clear()
                            rec.filter.add("PASS")
                            # str(rec) already ends with a newline.
                            out_vcf_handle.write(str(rec))
            finally:
                in_vcf_handle.close()

            print("Writing tabix index for {0}...".format(out_vcf_name))
            # Per the original author this call may only compress the file,
            # so the index is built explicitly if it is still missing.
            pysam.tabix_index(out_vcf_name, preset="vcf")
            if not os.path.isfile(out_vcf_name_gz_tbi):
                pysam.tabix_index(out_vcf_name_gz, preset="vcf")

            if os.path.isfile(out_vcf_name):
                os.remove(out_vcf_name)

    print("Finished writing {0}".format(out_vcf_list))
    print("Complete!")
def main(argv):
    """Command-line entry point for ``svtk vcfcluster``.

    Reads a list of standardized VCF paths, clusters their records with
    ``VCFCluster`` and writes the named records to the output VCF. Records
    with ``0 <= SVLEN < --svsize`` are dropped; records with ``SVLEN`` of -1
    are kept regardless of size.

    Parameters
    ----------
    argv : list of str
        Command-line arguments without the program name. When empty, the
        help text is printed and the process exits with status 1.

    Raises
    ------
    ValueError
        If both ``--skip-merge`` and ``--merge-only`` are given.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtk vcfcluster',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('filelist', type=argparse.FileType('r'),
                        help='List of paths to standardized VCFS')
    parser.add_argument('fout', help='Clustered VCF.')
    parser.add_argument('-r', '--region', default=None,
                        help='Restrict clustering to genomic region.')
    parser.add_argument('-d', '--dist', type=int, default=500,
                        help='Maximum clustering distance. Suggested to use '
                        'max of median + 7*MAD over samples. [500]')
    parser.add_argument('-f', '--frac', type=float, default=0.1,
                        help='Minimum reciprocal overlap between variants. '
                        '[0.1]')
    parser.add_argument('-x', '--blacklist', metavar='BED.GZ',
                        type=TabixFile, default=None,
                        help='Tabix indexed bed of blacklisted regions. Any '
                        'SV with a breakpoint falling inside one of these '
                        'regions is filtered from output.')
    # The bracketed default in the help text now matches the real default.
    parser.add_argument('-z', '--svsize', type=int, default=500,
                        help='Minimum SV size to report for intrachromosomal '
                        'events. [500]')
    parser.add_argument('-p', '--prefix', default='MERGED',
                        help='Prefix for merged variant IDs. [MERGED]')
    parser.add_argument('-t', '--svtypes', default='DEL,DUP,INV,BND',
                        help='Comma delimited list of svtypes to cluster '
                        '[DEL,DUP,INV,BND]')
    parser.add_argument('--ignore-svtypes', action='store_true', default=False,
                        help='Ignore svtypes when clustering.')
    parser.add_argument('-o', '--sample-overlap', type=float, default=0.0,
                        help='Minimum sample overlap for two variants to be '
                        'clustered together.')
    parser.add_argument('--preserve-ids', action='store_true', default=False,
                        help='Include list of IDs of constituent records in '
                        'each cluster.')
    parser.add_argument('--preserve-genotypes', action='store_true',
                        default=False,
                        help='In a set of clustered variants, report best '
                        '(highest GQ) non-reference genotype when available.')
    parser.add_argument('--preserve-header', action='store_true',
                        default=False,
                        help='Use header from clustering VCFs')
    parser.add_argument(
        '--skip-merge', action='store_true', default=False,
        help='Do not merge clustered records. Adds CLUSTER info fields.')
    parser.add_argument(
        '--merge-only', action='store_true', default=False,
        help='When run on a vcf generated with --skip-merge, only merges records '
        'with identical CLUSTER fields.')
    parser.add_argument(
        '--single-end', action='store_true', default=False,
        help='Require only one end to be within the minimum distance.')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if args.skip_merge and args.merge_only:
        raise ValueError('Cannot use both --skip-merge and --merge-only')

    # Parse SV files and lists of samples and sources
    filepaths = [line.strip() for line in args.filelist.readlines()]
    vcfs = parse_filepaths(filepaths)

    svtypes = args.svtypes.split(',')
    match_svtypes = not args.ignore_svtypes

    do_merge = not args.skip_merge
    do_cluster = not args.merge_only

    svc = VCFCluster(vcfs, dist=args.dist, blacklist=args.blacklist,
                     frac=args.frac, svtypes=svtypes, region=args.region,
                     match_svtypes=match_svtypes,
                     preserve_ids=args.preserve_ids,
                     preserve_genotypes=args.preserve_genotypes,
                     sample_overlap=args.sample_overlap,
                     preserve_header=args.preserve_header,
                     do_cluster=do_cluster,
                     do_merge=do_merge,
                     single_end=args.single_end)

    # Open new file
    if args.fout in '- stdout'.split():
        fout = sys.stdout
    else:
        fout = open(args.fout, 'w')

    fout = VariantFile(fout, mode='w', header=svc.header)

    try:
        for i, cluster in enumerate(svc.cluster()):
            cluster_id = [args.prefix] if args.prefix else ['SV']
            if args.region:
                chrom = args.region.split(':')[0]
                cluster_id.append(chrom)

            # When only one of the two stages runs, the cluster number comes
            # from the CLUSTER info field of the first record instead of the
            # enumeration index.
            if do_merge and do_cluster:
                cluster_index = i
            else:
                cluster_index = cluster[0].info['CLUSTER']

            cluster_id.append(str(cluster_index + 1))
            cluster_id = '_'.join(cluster_id)

            for record in cluster:
                # Name record; unmerged records keep their own id
                if do_merge:
                    record.id = cluster_id

                # Size filter (CTX have size -1). It must run before the
                # write, otherwise undersized records are still emitted.
                if -1 < record.info['SVLEN'] < args.svsize:
                    continue

                fout.write(record)
    finally:
        fout.close()
def create_matrix(self, src_file_name):
    '''
    Build the LD matrices for one source file: one matrix per chromosome,
    saved as a plotly heatmap (HTML, optionally JSON) and/or a TSV table
    depending on self.matrix_type.
    '''

    #Read the source table, extract the rsIDs from it and build
    #a dictionary in which variant positions and identifiers
    #are grouped by chromosome.
    data_by_chrs = create_src_dict(self.src_dir_path, src_file_name,
                                   self.meta_lines_quan,
                                   self.intgen_convdb_path)

    #All results derived from one source file are meant
    #to be placed into a single second-level folder.
    src_file_base = src_file_name.rsplit('.', maxsplit=1)[0]
    trg_dir_path = os.path.join(self.trg_top_dir_path,
                                f'{src_file_base}_LD_matr')

    #One matrix is created for
    #the variants of one chromosome.
    for chrom in data_by_chrs:

        #Check whether at least 2 variants belong to the
        #current chromosome. Only then does it make sense
        #to actually create the second-level target folder.
        if len(data_by_chrs[chrom]) < 2:
            continue
        if os.path.exists(trg_dir_path) == False:
            os.mkdir(trg_dir_path)

        #rsIDs are sorted by genomic position so that the
        #influence of physical distance on LD is easier
        #to judge visually later on.
        data_by_chrs[chrom].sort(key=lambda row: row[0])
        poss_srtd, rs_ids_srtd = [], []
        for row in data_by_chrs[chrom]:
            poss_srtd.append(row[0])
            rs_ids_srtd.append(row[1])

        #The number of rsIDs is needed right away to set
        #the matrix dimensions, and later on to lay out
        #the tabular version of the matrix.
        vars_quan = len(rs_ids_srtd)

        #The text or graphical matrix is
        #backed by a two-dimensional array
        #with the following structure
        #(lower triangle filled, rest zero):
        ''' 0 0 0 ... val 0 0 ... val val 0 ... ... ... ... ... '''

        #Build a square two-dimensional template filled with
        #zeros. Zeros may later be replaced with LD values.
        ld_two_dim = [[0 for col_index in range(vars_quan)]
                      for row_index in range(vars_quan)]

        #If a chart is going to be drawn, the same
        #template is needed for the matrix of
        #accompanying (hover) information.
        if self.matrix_type in ['heatmap', 'both']:
            info_two_dim = copy.deepcopy(ld_two_dim)

        #LD calculation and variant annotation need 1000 Genomes
        #data. Open the tabix-indexed 1000 Genomes archive of the
        #current chromosome with pysam, which gives fast
        #random access to the lines of the archive.
        with VariantFile(
                os.path.join(self.intgen_dir_path,
                             f'{chrom}.vcf.gz')) as intgen_vcf_opened:

            #Iterate over the row and column indices
            #of the initially zero-filled matrices.
            for row_index in range(vars_quan):
                for col_index in range(vars_quan):

                    #In principle the matrix could be a square made of
                    #two right triangles with identical shape and
                    #content, separated by a diagonal of 0-cells.
                    #Keeping only one of the triangles is more sensible,
                    #so values are computed only for cells whose row
                    #index is greater than the column index.
                    if row_index <= col_index:
                        continue

                    #Pull the phased genotypes of the current variant
                    #pair from 1000 Genomes, restricted to the selected
                    #samples. Genotype pairs are flattened into single
                    #alleles because the LD calculator requires it.
                    #The annotations of each variant of the pair are
                    #extracted from 1000 Genomes as well.
                    #NOTE(review): if no fetched record has the expected
                    #rsID, *_var_alleles / *_var_type are not reassigned
                    #(stale from a previous pair, or unbound on the first
                    #one) and the genotype list stays empty - confirm that
                    #create_src_dict guarantees a match.
                    y_var_genotypes, x_var_genotypes = [], []
                    y_var_row = data_by_chrs[chrom][row_index]
                    for intgen_rec in intgen_vcf_opened.fetch(
                            chrom, y_var_row[0] - 1, y_var_row[0]):
                        if intgen_rec.id != y_var_row[1]:
                            continue
                        y_var_alleles = intgen_rec.ref + '/' + intgen_rec.alts[
                            0]
                        y_var_type = intgen_rec.info['VT'][0]
                        for sample_name in self.sample_names:
                            #Samples absent from the record are skipped.
                            try:
                                y_var_genotypes += intgen_rec.samples[
                                    sample_name]['GT']
                            except KeyError:
                                continue
                        #Only the first record with a matching ID is used.
                        break
                    x_var_row = data_by_chrs[chrom][col_index]
                    for intgen_rec in intgen_vcf_opened.fetch(
                            chrom, x_var_row[0] - 1, x_var_row[0]):
                        if intgen_rec.id != x_var_row[1]:
                            continue
                        x_var_alleles = intgen_rec.ref + '/' + intgen_rec.alts[
                            0]
                        x_var_type = intgen_rec.info['VT'][0]
                        for sample_name in self.sample_names:
                            try:
                                x_var_genotypes += intgen_rec.samples[
                                    sample_name]['GT']
                            except KeyError:
                                continue
                        break

                    #Call the offline calculator to get a dictionary
                    #with r2, D' and the alternative allele frequencies
                    #of the variant pair for the populations and
                    #genders selected by the researcher.
                    trg_vals = calc_ld(y_var_genotypes, x_var_genotypes)

                    #Every cell of the visualized matrix is annotated:
                    #alongside the array of LD values, an array of
                    #additional information about each variant
                    #pair is filled in.
                    if self.matrix_type in ['heatmap', 'both']:
                        info_two_dim[row_index][col_index] = f''' r2: {trg_vals["r_square"]}<br> D': {trg_vals["d_prime"]}<br> abs_dist: {abs(poss_srtd[col_index] - poss_srtd[row_index])}<br><br> {rs_ids_srtd[col_index]}.hg38_pos: {poss_srtd[col_index]}<br> {rs_ids_srtd[row_index]}.hg38_pos: {poss_srtd[row_index]}<br><br> {rs_ids_srtd[col_index]}.alleles: {x_var_alleles}<br> {rs_ids_srtd[row_index]}.alleles: {y_var_alleles}<br><br> {rs_ids_srtd[col_index]}.type: {x_var_type}<br> {rs_ids_srtd[row_index]}.type: {y_var_type}<br><br> {rs_ids_srtd[col_index]}.alt_freq: {trg_vals['var_2_alt_freq']}<br> {rs_ids_srtd[row_index]}.alt_freq: {trg_vals['var_1_alt_freq']} '''

                    #The researcher may have set a lower LD threshold.
                    #This block deliberately comes after the annotation
                    #block: on charts, cells with sub-threshold LD are
                    #coloured as zero, yet hovering over them still shows
                    #the real LD values taken from the annotation array.
                    #With the blocks in the opposite order, annotations of
                    #sub-threshold LD would be lost because of the
                    #continue in the filtering block.
                    if self.ld_low_thres != None:
                        if trg_vals[self.ld_measure] < self.ld_low_thres:
                            continue

                    #An LD value that was not filtered out as
                    #sub-threshold goes into the LD matrix: the 0-cell
                    #is replaced with the value of the chosen LD measure.
                    ld_two_dim[row_index][col_index] = trg_vals[
                        self.ld_measure]

        #A target file name that tries to be informative.
        #Which extension gets appended to it depends
        #on the output format chosen by the researcher.
        trg_file_base = f'{src_file_base}_chr{chrom}_{self.ld_measure[0]}'

        #Visualize the matrix with plotly.
        if self.matrix_type in ['heatmap', 'both']:

            #The researcher allowed text on the chart:
            #rsIDs as axis labels and LD values inside
            #the squares of the heatmap itself.
            if self.disp_letters:

                #Create the annotated heatmap object. What it consists
                #of is described in the ld_triangle readme. Note only
                #that create_annotated_heatmap is a high-level plotly
                #function that takes over most of this work.
                ld_heatmap = ff.create_annotated_heatmap(
                    ld_two_dim,
                    x=rs_ids_srtd,
                    y=rs_ids_srtd,
                    hovertext=info_two_dim,
                    hoverinfo='text',
                    xgap=1,
                    ygap=1,
                    colorscale=self.color_pal,
                    showscale=False)

                #Optional customization of the font size of
                #the axis labels and the numbers in the squares.
                if self.font_size != None:
                    ld_heatmap.layout.xaxis.tickfont.size = self.font_size
                    ld_heatmap.layout.yaxis.tickfont.size = self.font_size
                    for ann_num in range(len(
                            ld_heatmap.layout.annotations)):
                        ld_heatmap['layout']['annotations'][ann_num][
                            'font']['size'] = self.font_size

            #The researcher preferred a heatmap with a minimum of
            #text. Labels usually have to be sacrificed to keep them
            #from overlapping in large charts. Build the trace object
            #and the layout object and combine them into the final
            #figure. See the readme for more on plotly object structure.
            else:
                trace = go.Heatmap(z=ld_two_dim,
                                   hovertext=info_two_dim,
                                   hoverinfo='text',
                                   xgap=1,
                                   ygap=1,
                                   colorscale=self.color_pal,
                                   showscale=False)
                layout = go.Layout(xaxis_showticklabels=False,
                                   yaxis_showticklabels=False)
                ld_heatmap = go.Figure(data=trace, layout=layout)

            #Optionally force the
            #chart into a square shape.
            if self.square_shape:
                ld_heatmap.update_layout(xaxis_constraintoward='left',
                                         yaxis_scaleanchor='x',
                                         yaxis_scaleratio=1,
                                         plot_bgcolor='rgba(0,0,0,0)')

            #The following settings mostly concern text other than
            #LD values and rsIDs: the title and the footer. Placing
            #the footer needed a small trick - the X axis title is
            #used for it. In addition, the chart is flipped along Y
            #for visual compatibility with LDmatrix heatmaps.
            title = f''' defines color: {self.ld_measure} ░ LD threshold: {self.ld_low_thres} ░ chromosome: {chrom} ░ genders: {", ".join(self.gend_names)} ░ populations: {", ".join(self.pop_names)} '''
            ld_heatmap.update_layout(title_text=title,
                                     xaxis_side='bottom',
                                     yaxis_autorange='reversed')
            if self.dont_disp_footer == False:
                footer = ''' made by ld_triangle from <a href="https://github.com/PlatonB/ld-tools">ld-tools</a> ░ readme: <a href="https://github.com/PlatonB/ld-tools/blob/master/README.md">ru</a> <a href="https://github.com/PlatonB/ld-tools/blob/master/README-EN.md">en</a> ░ <a href="https://www.tinkoff.ru/rm/bykadorov.platon1/7tX2Y99140/">donate</a> '''
                ld_heatmap.update_layout(xaxis_title_text=footer,
                                         xaxis_title_font_size=10)

            #Dump all chart data to JSON
            #if the researcher asked for it.
            if self.heatmap_json:
                debug_file_name = trg_file_base + '.json'
                ld_heatmap.write_json(os.path.join(trg_dir_path,
                                                   debug_file_name),
                                      pretty=True)

            #Save the chart as HTML.
            html_file_name = trg_file_base + '.html'
            ld_heatmap.write_html(
                os.path.join(trg_dir_path, html_file_name))

        #The researcher chose to create
        #tabular versions of the LD matrices.
        if self.matrix_type in ['table', 'both']:

            #Create the target text file. Write a header with the general
            #characteristics of the matrix, an empty line and two heading
            #rows: one with rsIDs, the other with positions. Then write
            #the LD rows, each prefixed with its own rsID and position.
            tsv_file_name = trg_file_base + '.tsv'
            with open(os.path.join(trg_dir_path, tsv_file_name),
                      'w') as tsv_file_opened:
                #Positions are converted to strings here so they can be
                #joined; this runs after the heatmap code that needs
                #them as numbers.
                tab, poss_srtd = '\t', list(
                    map(lambda pos: str(pos), poss_srtd))
                tsv_file_opened.write(
                    f'##General\tinfo:\t{self.ld_measure}\tchr{chrom}\t{tab.join(self.pop_names)}\t{tab.join(self.gend_names)}\n\n'
                )
                tsv_file_opened.write('rsIDs\t\t' +
                                      '\t'.join(rs_ids_srtd) + '\n')
                tsv_file_opened.write('\tPositions\t' +
                                      '\t'.join(poss_srtd) + '\n')
                for row_index in range(vars_quan):
                    line = '\t'.join(map(str,
                                         ld_two_dim[row_index])) + '\n'
                    tsv_file_opened.write(rs_ids_srtd[row_index] + '\t' +
                                          poss_srtd[row_index] + '\t' +
                                          line)
#!/bin/python3.6 import sys from pysam import VariantFile import subprocess vcf_in = VariantFile(sys.argv[1]) new_header = new_header = vcf_in.header vcf_out = VariantFile(sys.argv[2], 'w', header=new_header) sv_out = sys.argv[2] + '.svtypeDEL.txt' indelArteFile = sys.argv[3] for record in vcf_in.fetch(): # import pdb; pdb.set_trace() try: if record.info["SVTYPE"] == 'DEL': with open(sv_out, 'a+') as svtype_out: svtype_out.write(str(record)) except KeyError: if len(record.ref) != len(record.alts[0]): # if InDel if ( "mutect2" in record.info["CALLERS"] or "vardict" in record.info["CALLERS"] ): # Support by either Vardict or Manta, ok. # Check if indel artefact # import pdb; pdb.set_trace() write = 1 cmdIndelArte = 'grep -w ' + str(record.pos) + ' ' + indelArteFile artefactLines = ( subprocess.run(cmdIndelArte, stdout=subprocess.PIPE, shell='TRUE').stdout.decode('utf-8').strip() ) for artefactLine in artefactLines.split("\n"): if (
def vcf_to_ref(outfile,
               vcf_file,
               rec_file,
               pop2sample,
               random_read_samples=[],
               pos_id="Physical_Pos",
               map_ids=["AA_Map"],
               default_map="AA_Map",
               rec_rate=1e-8,
               chroms=None,
               bed=None,
               lax_alleles=False):
    """Write a per-population allele-count reference table from VCF files.

    For every kept site one CSV row is written to the xz-compressed
    ``outfile``: ``chrom,pos,ref,alt``, the genetic-map position(s), and for
    every population one count column per entry of ``EXT``.

    Parameters
    ----------
    outfile : str
        Path of the xz-compressed CSV to create.
    vcf_file : str
        VCF path template; ``{CHROM}`` is filled in with the chromosome
        name. The file for chromosome ``'1'`` supplies contig and sample
        names.
    rec_file : str or None
        Space-separated recombination-map path template (``{CHROM}``), or
        None to use ``pos * rec_rate`` as the map position.
    pop2sample : dict
        Maps population name to an iterable of sample names; samples missing
        from the VCF header are ignored.
    random_read_samples : list
        Samples for which only the first genotype allele is counted. The
        default list is never mutated here.
    pos_id : str
        Column of the recombination map holding the physical position.
    map_ids : list of str
        Recombination-map columns to output. The default list is never
        mutated here (a new list is built from it).
    default_map : str
        Map column copied into the leading ``map`` column.
    rec_rate : float
        Per-base rate used when ``rec_file`` is None.
    chroms : optional
        Chromosome selection passed to ``parse_chroms``; None means all
        contigs in the VCF header.
    bed : optional
        Not used by this function.
    lax_alleles : bool
        When true, alt alleles other than a single A/C/G/T base are kept.
    """
    pprint(pop2sample)

    # get chromosomes and the sample -> populations mapping from the header
    with VariantFile(vcf_file.format(CHROM='1')) as vcf:
        if chroms is None:
            chroms = list(vcf.header.contigs)
        else:
            chroms = parse_chroms(chroms)
        log_.info("chroms found: %s", chroms)

        sample2pop = defaultdict(list)
        for pop, v in pop2sample.items():
            for sample in v:
                if sample in vcf.header.samples:
                    sample2pop[sample].append(pop)

    samples = sample2pop.keys()
    pops = set(pop for s, v in sample2pop.items() for pop in v)
    pprint(sample2pop)
    pprint(pops)

    map_ids = ['map'] + map_ids
    data_cols = [f"{p}_{e}" for p in pops for e in EXT]

    with lzma.open(outfile, "wt") as ref:
        ref.write("chrom,pos,ref,alt,")
        if rec_file is None:
            ref.write("map,")
        else:
            ref.write(",".join(map_ids))
            ref.write(",")
        ref.write(",".join(data_cols))
        ref.write("\n")

        for chrom in chroms:

            # set up rec file
            if rec_file is not None:
                rec = pd.read_csv(rec_file.format(CHROM=chrom), sep=" ")
                if "chrom" in rec:
                    rec = rec[rec.chrom == chrom]
                rec['map'] = rec[default_map]
                rec_file_cols = list((pos_id, *map_ids))
                rec = rec[rec_file_cols]

                rec_iter = rec.iterrows()
                R0 = next(rec_iter)[1]
                # A one-row map has no right-hand neighbour; the code below
                # already treats R1 = None as "past the end of the map".
                R1 = next(rec_iter, (None, None))[1]

            # skip chrom if empty
            with VariantFile(vcf_file.format(CHROM=chrom)) as vcf:
                try:
                    next(vcf)
                except StopIteration:
                    continue

            with VariantFile(vcf_file.format(CHROM=chrom)) as vcf:
                vcf.subset_samples(samples)
                for row in vcf.fetch(chrom):

                    alt_ix = 0

                    if len(row.alleles) <= 1 or len(row.alleles) > 3:
                        continue

                    # Tri-allelic sites are kept only if exactly one of the
                    # two alt alleles is observed in the selected samples.
                    if len(row.alleles) == 3:
                        alleles = [
                            i for v in row.samples.values() for i in v["GT"]
                        ]
                        if 3 in alleles:
                            continue
                        elif 1 in alleles and 2 in alleles:
                            continue
                        elif 1 not in alleles and 2 not in alleles:
                            continue
                        elif 1 in alleles:
                            alt_ix = 0
                        elif 2 in alleles:
                            alt_ix = 1
                        else:
                            raise ValueError(f"weird alleles {row.alleles}")
                        log_.debug(
                            f"{row.chrom}, {row.pos}, {row.alleles}, {Counter(alleles)}"
                        )

                    # Require a single-base alt unless lax_alleles is set.
                    # The previous test (`not in "ACGT" or lax_alleles`)
                    # skipped every site when lax_alleles was true and let
                    # multi-base substrings such as "AC" through.
                    alt = row.alts[alt_ix]
                    if not lax_alleles and alt not in ("A", "C", "G", "T"):
                        continue

                    D = defaultdict(int)

                    # rec stuff
                    if rec_file is None:
                        map_ = row.pos * rec_rate
                        ref.write(
                            f"{row.chrom},{row.pos},{row.ref},{row.alts[alt_ix]},{map_},"
                        )
                    else:
                        # Advance the (R0, R1) window until it covers the
                        # site or the map is exhausted.
                        if R1 is not None and row.pos > R1[pos_id]:
                            try:
                                while row.pos > R1[pos_id]:
                                    R0, R1 = R1, next(rec_iter)[1]
                            except StopIteration:
                                R0, R1 = R1, None

                        if R1 is None or row.pos <= R0[pos_id]:
                            # Outside the map: clamp to the nearest entry.
                            map_ = R0[map_ids]
                        else:
                            # Linear interpolation between R0 and R1. The
                            # slope is already per base pair, so it is
                            # multiplied by the distance only once.
                            slope = (R1[map_ids] - R0[map_ids]) / (
                                R1[pos_id] - R0[pos_id])
                            map_ = R0[map_ids] + slope * (row.pos - R0[pos_id])

                        ref.write(
                            f"{row.chrom},{row.pos},{row.ref},{row.alts[alt_ix]},"
                        )
                        map_str = ",".join((str(m) for m in map_))
                        ref.write(f"{map_str},")

                    sample_data = row.samples
                    for s in sample_data:
                        if s in random_read_samples:
                            # Pseudo-haploid sample: count one allele only.
                            allele = sample_data[s]["GT"][0]
                            if allele is not None:
                                for pop in sample2pop[s]:
                                    D[f"{pop}_{EXT[allele > 0]}"] += 1
                        else:
                            for allele in sample_data[s]["GT"]:
                                if allele is not None:
                                    for pop in sample2pop[s]:
                                        D[f"{pop}_{EXT[allele > 0]}"] += 1

                    ref.write(",".join((str(D[c]) for c in data_cols)))
                    ref.write("\n")