def read_vep(in_vep_vcf, variants):
    with pysam.VariantFile(in_vep_vcf) as ifile:
        csq_meta = ifile.header.info.get('CSQ', None)
        if csq_meta is None:
            raise Exception('No meta-information entry about CSQ INFO field found!')
        csq_header = csq_meta.description.split(':', 1)[1].strip().split('|')
        for record in ifile.fetch():
            if len(record.alts) > 1:  # multi-allelic variants must be split into multiple bi-allelic VCF entries
                raise Exception(
                    'Multi-allelic VCF records are not supported. Multi-allelic '
                    'variants must be split into multiple bi-allelic VCF entries.')
            variant_name = (record.chrom[3:] if record.chrom.startswith('chr') else record.chrom,
                            record.pos, record.ref, record.alts[0])
            variant = variants.get(variant_name, None)
            if variant is None:
                continue
            variant_csqs = set()
            lof = False
            for csq in record.info['CSQ']:
                csq = dict(zip(csq_header, csq.split('|')))
                if csq['BIOTYPE'] != 'protein_coding':
                    continue
                if csq['LoF'] == 'HC':
                    lof = True
                csqs = csq['Consequence'].split('&')
                if any(x in csqs for x in cds_variant_types):
                    variant_csqs.update(csqs)
            if not variant_csqs:
                variants.pop(variant_name)
                continue
            variant_most_severe_csq = None
            if 'splice_acceptor_variant' in variant_csqs or 'splice_donor_variant' in variant_csqs:
                variant_most_severe_csq = 'splice'
            elif 'stop_gained' in variant_csqs:
                variant_most_severe_csq = 'stop_gained'
            elif 'stop_lost' in variant_csqs:
                variant_most_severe_csq = 'stop_lost'
            elif 'start_lost' in variant_csqs:
                variant_most_severe_csq = 'start_lost'
            elif 'frameshift_variant' in variant_csqs:
                variant_most_severe_csq = 'frameshift'
            elif 'inframe_insertion' in variant_csqs:
                variant_most_severe_csq = 'inframe_insertion'
            elif 'inframe_deletion' in variant_csqs:
                variant_most_severe_csq = 'inframe_deletion'
            elif 'missense_variant' in variant_csqs:
                variant_most_severe_csq = 'missense'
            elif ('synonymous_variant' in variant_csqs
                  or 'stop_retained_variant' in variant_csqs
                  or 'start_retained_variant' in variant_csqs):
                variant_most_severe_csq = 'synonymous'
            else:
                print(f'WARNING (not in CDS): Variant {variant_name} ({variant_csqs}) will be omitted.')
                variants.pop(variant_name)
                continue
            categories = ['ALL', variant_most_severe_csq]
            if lof:
                categories.append('LOF')
            length = len(record.ref) - len(record.alts[0])
            if length == 0:
                if len(record.ref) > 1:
                    categories.append('MNP')
                else:
                    categories.append('SNP')
            elif length > 0:
                categories.append('INDEL')
                categories.append('DEL')
                length = abs(length)
                if length < 4:
                    categories.append(f'DEL:{length}')
                elif length < 10:
                    categories.append('DEL:4-9')
                else:
                    categories.append('DEL:10+')
            elif length < 0:
                categories.append('INDEL')
                categories.append('INS')
                length = abs(length)
                if length < 4:
                    categories.append(f'INS:{length}')
                elif length < 10:
                    categories.append('INS:4-9')
                else:
                    categories.append('INS:10+')
            variant.ac = record.info['AC'][0]
            variant.an = record.info['AN']
            variant.cat = categories
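# Hypothetical usage sketch for read_vep, not part of the original source. It
# assumes cds_variant_types is a module-level set of VEP consequence terms
# (referenced above) and that the values stored in `variants` are simple
# mutable objects whose ac/an/cat attributes read_vep fills in.
class VariantRecordStub:
    ac = None
    an = None
    cat = None

variants = {('1', 12345, 'A', 'T'): VariantRecordStub()}  # keyed by (chrom, pos, ref, alt)
read_vep('annotated.vep.vcf.gz', variants)                # hypothetical file name
for name, v in variants.items():
    print(name, v.ac, v.an, v.cat)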
def count_sr(argv):
    parser = argparse.ArgumentParser(
        description="Count clipped reads at SV breakpoints. Unwindowed.",
        prog='svtk count-sr',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf',
                        help='VCF of variant calls. Standardized to include '
                        'CHR2, END, SVTYPE, STRANDS in INFO.')
    parser.add_argument('countfile',
                        help='Tabix indexed file of split counts. '
                        'Columns: chrom,pos,clip,count,sample')
    parser.add_argument('fout', help='Output table of split read counts.')
    parser.add_argument('--common', default=False, action='store_true',
                        help='Ignore background for common AF')
    parser.add_argument('-s', '--samples', type=argparse.FileType('r'),
                        default=None,
                        help='Whitelist of samples to restrict testing to.')
    parser.add_argument('--index', default=None,
                        help='Tabix index of discordant pair file. Required if '
                        'discordant pair file is hosted remotely.')
    # TODO: add normalization
    parser.add_argument('--medianfile', default=None,
                        help='Median coverage statistics for each library '
                        '(optional). If provided, each sample\'s split '
                        'counts will be normalized accordingly. '
                        'Same format as RdTest, one column per sample.')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    vcf = pysam.VariantFile(args.vcf)

    if args.index is not None:
        countfile = pysam.TabixFile(args.countfile, index=args.index,
                                    parser=pysam.asTuple())
    else:
        if args.countfile.startswith('http'):
            raise Exception('Must provide tabix index with remote URL')
        countfile = pysam.TabixFile(args.countfile, parser=pysam.asTuple())

    if args.fout in '- stdout'.split():
        fout = sys.stdout
    else:
        fout = open(args.fout, 'w')

    header = 'name coord sample count'.split()
    fout.write('\t'.join(header) + '\n')

    if args.samples is not None:
        whitelist = [s.strip() for s in args.samples.readlines()]
    else:
        whitelist = [s for s in vcf.header.samples]

    if args.medianfile is not None:
        medians = pd.read_table(args.medianfile)
        medians = pd.melt(medians, var_name='sample', value_name='median_cov')
    else:
        medians = None

    srtest = SRTest(countfile, args.common, window=0, medians=medians)

    for record in vcf:
        for coord in 'start end'.split():
            if coord == 'start':
                pos, strand, chrom = record.pos, record.info['STRANDS'][0], record.chrom
            else:
                # TODO: With a properly formatted VCF, should be using END2 instead of END here
                pos, strand, chrom = record.stop, record.info['STRANDS'][1], record.info['CHR2']
            counts = srtest.load_counts(chrom, pos, strand)
            counts = srtest.normalize_counts(counts)
            counts = counts['sample count'.split()]
            counts = counts.set_index('sample')
            counts = counts.reindex(whitelist).fillna(0).astype(int)
            counts = counts.reset_index()
            counts['name'] = record.id
            counts['coord'] = coord
            for row in counts[header].values:
                fout.write('\t'.join([str(x) for x in row]) + '\n')
def get_variants(path):
    records = []
    with pysam.VariantFile(path) as vcf:
        for rec in vcf:
            records.append(rec)
    return records
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Input vcf (supports "stdin").')
    parser.add_argument('minGQtable', help='Tab-delimited minGQ filtering lookup'
                        ' table generated by create_minGQ_lookup_table.R.')
    parser.add_argument('fout', help='Output file (supports "stdout").')
    parser.add_argument('-m', '--minGQ', help='Global min GQ', type=int,
                        default=0, dest='globalMin')
    parser.add_argument('--multiallelics', default=False, action='store_true',
                        help='Also apply filtering to multiallelic sites '
                        '(default: skip multiallelics).')
    parser.add_argument('--dropEmpties', default=False, action='store_true',
                        help='After GT reassignments, drop any SV with no remaining '
                        'non-ref samples (default: keep all SV).')
    parser.add_argument('--maxNCR', help='Max no-call rate among all '
                        'samples before adding a flag to the record\'s FILTER field '
                        '(default: 0.005)',
                        type=float, default=0.005, dest='maxNCR')
    parser.add_argument('--cleanAFinfo', help='Remove all AF-related terms from '
                        'the INFO field and VCF header (default: keep all terms).',
                        default=False, action='store_true')
    parser.add_argument('--prefix', help='Cohort label to append to NCR FILTER.',
                        default='COHORT', dest='prefix')
    args = parser.parse_args()

    if args.vcf in '- stdin'.split():
        vcf = pysam.VariantFile(sys.stdin)
    else:
        vcf = pysam.VariantFile(args.vcf)

    # Add HIGH_NOCALL_RATE filter to vcf header
    NEW_FILTER = '##FILTER=<ID=HIGH_{0}_NOCALL_RATE,Description="More than '.format(args.prefix) + \
                 '{:.2%}'.format(args.maxNCR) + ' of {0} sample GTs were '.format(args.prefix) + \
                 'masked as no-call GTs due to low GQ. Indicates a possibly noisy locus ' + \
                 'in {0} samples.>'.format(args.prefix)
    header = vcf.header
    header.add_line(NEW_FILTER)
    filter_text = 'HIGH_{0}_NOCALL_RATE'.format(args.prefix)

    if args.fout in '- stdout'.split():
        fout = pysam.VariantFile(sys.stdout, 'w', header=vcf.header)
    else:
        fout = pysam.VariantFile(args.fout, 'w', header=vcf.header)

    # Make dummy lookup tables for SVLEN, AF, SVTYPE, FILTER, and EV
    SVLEN_table = _make_SVLEN_interval_dict(args.minGQtable)
    AF_table = _make_AF_interval_dict(args.minGQtable)
    SVTYPE_table = _make_SVTYPE_dict(args.minGQtable)
    FILTER_table = _make_FILTER_dict(args.minGQtable, vcf)
    EV_table = _make_EV_dict(args.minGQtable)

    # Make minGQ lookup table
    minGQ_dict = make_minGQ_dict(args.minGQtable, SVLEN_table, AF_table,
                                 SVTYPE_table, FILTER_table, EV_table)

    # Iterate over records in vcf and apply filter
    for record in vcf.fetch():
        # Do not process multiallelic variants, unless optioned
        if args.multiallelics or not _is_multiallelic(record):
            apply_minGQ_filter(record, minGQ_dict, SVLEN_table, AF_table,
                               SVTYPE_table, FILTER_table, EV_table,
                               globalMin=args.globalMin, maxNCR=args.maxNCR,
                               highNCR_filter=filter_text)
        if args.cleanAFinfo:
            for key in 'AN AC AF N_BI_GENOS N_HOMREF N_HET N_HOMALT FREQ_HOMREF FREQ_HET FREQ_HOMALT'.split():
                if key in record.info.keys():
                    record.info.pop(key)
        if args.dropEmpties:
            samps = svu.get_called_samples(record, include_null=False)
            if len(samps) > 0:
                fout.write(record)
        else:
            fout.write(record)

    fout.close()
def sr_test(argv):
    parser = argparse.ArgumentParser(
        description="Calculate enrichment of clipped reads at SV breakpoints.",
        prog='svtk sr-test',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf',
                        help='VCF of variant calls. Standardized to include '
                        'CHR2, END, SVTYPE, STRANDS in INFO.')
    parser.add_argument('countfile',
                        help='Tabix indexed file of split counts. '
                        'Columns: chrom,pos,clip,count,sample')
    parser.add_argument('fout',
                        help='Output table of most significant start/end '
                        'positions.')
    parser.add_argument('-w', '--window', type=int, default=100,
                        help='Window around variant start/end to consider for '
                        'split read support. [100]')
    parser.add_argument('--common', default=False, action='store_true',
                        help='Ignore background for common AF')
    parser.add_argument('-b', '--background', type=int, default=160,
                        help='Number of background samples to choose for '
                        'comparison in t-test. [160]')
    parser.add_argument('-s', '--samples', type=argparse.FileType('r'),
                        default=None,
                        help='Whitelist of samples to restrict testing to.')
    parser.add_argument('--index', default=None,
                        help='Tabix index of discordant pair file. Required if '
                        'discordant pair file is hosted remotely.')
    # TODO: add normalization
    parser.add_argument('--medianfile', default=None,
                        help='Median coverage statistics for each library '
                        '(optional). If provided, each sample\'s split '
                        'counts will be normalized accordingly. '
                        'Same format as RdTest, one column per sample.')
    parser.add_argument('--log', action='store_true', default=False,
                        help='Print progress log to stderr.')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    vcf = pysam.VariantFile(args.vcf)

    if args.index is not None:
        countfile = pysam.TabixFile(args.countfile, index=args.index,
                                    parser=pysam.asTuple())
    else:
        if args.countfile.startswith('http'):
            raise Exception('Must provide tabix index with remote URL')
        countfile = pysam.TabixFile(args.countfile, parser=pysam.asTuple())

    if args.fout in '- stdout'.split():
        fout = sys.stdout
    else:
        fout = open(args.fout, 'w')

    header = 'name coord pos log_pval called_median bg_median bg_frac'.split()
    fout.write('\t'.join(header) + '\n')

    if args.samples is not None:
        whitelist = [s.strip() for s in args.samples.readlines()]
    else:
        whitelist = None

    if args.medianfile is not None:
        medians = pd.read_table(args.medianfile)
        medians = pd.melt(medians, var_name='sample', value_name='median_cov')
    else:
        medians = None

    runner = SRTestRunner(vcf, countfile, fout, args.background, args.common,
                          args.window, whitelist, medians=medians, log=args.log)
    runner.run()
def load_vcf(vcf_file, info_keys=[], format_keys=[]):
    """Function to load VCF into gwas dataframe."""
    # Load VCF file using pysam
    reader = pysam.VariantFile(vcf_file)

    # "*" expands to every key declared in the header
    if "*" in info_keys:
        info_keys = list(dict(reader.header.info).keys())
    if "*" in format_keys:
        format_keys = list(dict(reader.header.formats).keys())

    print(info_keys)
    info_keys = set(info_keys)
    print(format_keys)
    format_keys = set(format_keys)

    df_dict = defaultdict(list)
    for record in reader:
        if len(record.alts) != 1:
            continue
        if record.ref not in nucleotide_dict or record.alts[0] not in nucleotide_dict:
            continue

        # Run through all variants and all their keys in format
        for sample in record.samples:
            format_dict = dict(record.samples[sample])
            for key, value in format_dict.items():
                if key not in format_keys:
                    continue
                # _add_basic_component(record, sample, df_dict)
                if key == "GT":
                    if None in list(value):
                        value = -1
                    else:
                        value = sum(list(value))
                _add_key_value(record, sample, f"call_{key}", value, df_dict)

        # Run through all variants and all their info keys
        info_dict = dict(record.info)
        for key, value in info_dict.items():
            if key not in info_keys:
                continue
            # _add_basic_component(record, sample, df_dict)
            _add_key_value(record, sample, key, value, df_dict)

    df = pd.DataFrame.from_dict(df_dict)
    df, feature_mapping = _create_numerical_features(df)
    df = df.pivot_table(
        index=["chrom", "pos", "ref", "alt", "sample", "quality", "feature_id"],
        columns="key",
        values="value",
    ).reset_index()
    cuda_df = cudf.DataFrame(df)
    return cuda_df
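# Hypothetical call of load_vcf (the file name is a placeholder): "*" expands
# to every INFO key declared in the header, as handled above, and GT values
# are collapsed to an allele dosage with missing calls encoded as -1.
gwas_df = load_vcf('cohort.vcf.gz', info_keys=['*'], format_keys=['GT', 'DP'])
print(gwas_df.head())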
def annotate_vcf_with_inference(args):
    cnns = {}
    stats = Counter()
    vcf_reader = pysam.VariantFile(args.negative_vcf, 'rb')
    pyvcf_vcf_reader = vcf.Reader(open(args.negative_vcf, 'rb'))
    input_tensors = {}

    for a in args.architectures:
        cnns[a] = models.set_args_and_get_model_from_semantics(args, a)
        print('Annotating with architecture:', a, 'sample name is', args.sample_name)
        if score_key_from_json(a) not in vcf_reader.header.info:
            vcf_reader.header.info.add(score_key_from_json(a), '1', 'Float',
                                       'Site-level score from Convolutional Neural Net named ' + a + '.')
        if defines.annotations_from_args(args) is not None:
            input_tensors[args.annotation_set] = (len(args.annotations),)
        input_tensors[args.tensor_map] = defines.tensor_shape_from_args(args)

    vcf_writer = pysam.VariantFile(args.output_vcf, 'w', header=vcf_reader.header)
    print('got vcfs. input tensor shape mapping:', input_tensors)

    reference = SeqIO.to_dict(SeqIO.parse(args.reference_fasta, "fasta"))
    print('got ref.')
    samfile = pysam.AlignmentFile(args.bam_file, "rb")
    print('got sam.')

    positions = []
    variant_batch = []
    time_batch = time.time()

    batch = {}
    for tm in input_tensors:
        batch[tm] = np.zeros(((args.batch_size,) + input_tensors[tm]))
    print('input tensors:', input_tensors)

    if args.chrom:
        print('iterate over region of vcf', args.chrom, args.start_pos, args.end_pos)
        variants = vcf_reader.fetch(args.chrom, args.start_pos, args.end_pos)
    else:
        print('iterate over vcf')
        variants = vcf_reader

    start_time = time.time()
    for variant in variants:
        idx_offset, ref_start, ref_end = get_variant_window(args, variant)
        args.chrom = variant.contig  # In case chrom isn't set on command line we need it to fetch reads.
        contig = reference[variant.contig]
        record = contig[ref_start:ref_end]
        v = pysam_variant_in_pyvcf(variant, pyvcf_vcf_reader)

        for tm in batch:
            batch_key = tm + '_in_batch'
            if tm in defines.annotations:
                args.annotation_set = tm
                annotation_data = td.get_annotation_data(args, v, stats)
                batch[tm][stats[batch_key]] = annotation_data
                stats[batch_key] += 1
            if 'read' in tm:
                args.tensor_map = tm
                if "read_tensor" == args.tensor_map:
                    read_tensor = td.make_reference_and_reads_tensor(args, v, samfile,
                                                                     record.seq, ref_start, stats)
                elif "paired_reads" == args.tensor_map:
                    read_tensor = td.make_paired_read_tensor(args, v, samfile, record.seq,
                                                             ref_start, ref_end, stats)
                else:
                    raise ValueError("Unknown read tensor mapping." + args.tensor_map)
                # Substitute a zero tensor before writing into the batch array;
                # a None cannot be assigned into a numpy slice.
                if read_tensor is None:
                    print('got empty', args.tensor_map, 'tensor at:', v)
                    read_tensor = np.zeros(input_tensors[tm])
                batch[tm][stats[batch_key]] = read_tensor
                stats[batch_key] += 1
            if 'reference' in tm:
                args.tensor_map = tm
                reference_tensor = td.make_reference_tensor(args, record.seq)
                batch[tm][stats[batch_key]] = reference_tensor
                stats[batch_key] += 1

        positions.append(variant.contig + '_' + str(variant.pos))
        variant_batch.append(variant)

        if stats[batch_key] == args.batch_size:
            apply_cnns_to_batch(args, cnns, batch, positions, variant_batch, vcf_writer, stats)

            # Reset the batch
            positions = []
            variant_batch = []
            for tm in batch:
                batch_key = tm + '_in_batch'
                batch[tm] = np.zeros(((args.batch_size,) + input_tensors[tm]))
                stats[batch_key] = 0

            stats['batches processed'] += 1
            if stats['batches processed'] % 10 == 0:
                elapsed = time.time() - start_time
                v_per_minute = stats['batches processed'] * args.batch_size / (elapsed / 60)
                print('Variants per minute:', v_per_minute,
                      'Batches:', stats['batches processed'], 'batches. Last variant:', variant)

        if stats['batches processed'] * args.batch_size > args.samples:
            break

    if stats[batch_key] > 0:
        apply_cnns_to_batch(args, cnns, batch, positions, variant_batch, vcf_writer, stats)

    for s in stats.keys():
        print(s, 'has:', stats[s])
def main():
    # Exception handling for input files format.
    if args['f'].endswith(".fa") or args['f'].endswith(".fasta"):
        genome = pysam.FastaFile(args['f'])  # open fasta file
        print('[ OK ] Reading Fasta file is done.')
    else:
        raise FileFormatError("\n[ ERROR ] Input File is not in Fasta format.")

    if args['v'].endswith(".vcf"):
        vcf = pysam.VariantFile(args['v'])  # open vcf file
        print('[ OK ] Reading vcf file is done.')
    else:
        raise FileFormatError("\n[ ERROR ] Input File is not in VCF format.")

    k = args['k']  # Length of kmer

    # Handling different output formats.
    header = ('## VMK-mer version: v1.0\n'
              '## Output file: {}\n'
              '## Reference fasta file: {}\n'
              '## VCF file: {}\n').format(args['o'] + '/' + args['outfile'],
                                          args['f'], args['v'])
    if args['outfmt'].upper() == 'TSV':
        with open(args['o'] + '/' + args['outfile'] + '.tsv', 'w') as fd:
            fd.write(header)
        head_dict = {'chr': 'Chr', 'pos': 'Pos', 'id': 'Mutation-ID',
                     'ref': 'Ref-Allele', 'alt': 'Mut-Allele',
                     'refk': 'Ref-Kmers', 'mutk': 'Mut-Kmers'}
        write_in_tsv("Head", head_dict)
    elif args['outfmt'].upper() == 'XML':
        with open(args['o'] + '/' + args['outfile'] + '.xml', 'w') as fd:
            fd.write(header)
    elif args['outfmt'] == 'both':
        with open(args['o'] + '/' + args['outfile'] + '.tsv', 'w') as fd:
            fd.write(header)
        with open(args['o'] + '/' + args['outfile'] + '.xml', 'w') as fd:
            fd.write(header)

    print('[ PROCESS ] Extracting mutant kmers, please wait...')
    total = recordcount("v", vcf)
    iterations = 0
    for record in vcf:
        # Handling the mutation type included in the info tag of some VCF files as "TSA".
        if 'TSA' in record.info.keys():
            mutation_type = str(record.info['TSA'])
            if mutation_type == "SNV":
                snp(record, genome, k)
            elif mutation_type == "insertion":
                insertion(record, genome, k)
            elif mutation_type == "deletion":
                deletion(record, genome, k)
        # Handling the mutation type included in the info tag of some VCF files as "VT".
        elif 'VT' in record.info.keys():
            mutation_type = str(record.info['VT'][0])
            if mutation_type == "SNP":
                snp(record, genome, k)
            elif mutation_type == "INDEL":
                if len(record.alts[0]) > len(record.ref):
                    insertion(record, genome, k)
                elif len(record.alts[0]) < len(record.ref):
                    deletion(record, genome, k)
        iterations += 1
        progress(iterations, total)
    print('\n[ OK ] All kmers have been extracted successfully.')
def main(argv=sys.argv):
    parser = E.ArgumentParser(description=__doc__)
    parser.add_argument("-i", "--input-vcf", dest="input_vcf_file",
                        type=str, help="input vcf file")
    parser.add_argument("-t", "--truth-vcf", dest="truth_vcf_file",
                        type=str, help="truth vcf file")
    parser.add_argument("-f", "--input-fasta", dest="input_fasta_file",
                        type=str,
                        help="input fasta file. faidx indexed reference sequence "
                        "file to determine INDEL context")
    parser.add_argument("-e", "--input-bed", dest="input_bed_file",
                        type=str,
                        help="input file with intervals. Tab-delimited file of "
                        "intervals in bed format to restrict analysis to.")
    parser.add_argument("-m", "--method", dest="methods", action="append",
                        type=str,
                        choices=("mutational-signature", "kinship"),
                        help="methods to apply")
    parser.set_defaults(
        methods=[],
        input_vcf_file=None,
        input_bed_file=None,
        input_fasta_file=None,
        truth_vcf_file=None,
    )

    (args, unknown) = E.start(parser, argv, add_output_options=True, unknowns=True)

    if len(unknown) == 1:
        args.input_vcf_file = unknown[0]

    if args.input_vcf_file is None:
        raise ValueError("please supply a VCF file")
    if args.truth_vcf_file is None:
        raise ValueError("please supply a VCF file with truth data")
    if args.input_fasta_file is None:
        raise ValueError("please supply a fasta file with the reference genome")

    if not os.path.exists(args.input_vcf_file):
        raise OSError("input vcf file {} does not exist".format(args.input_vcf_file))
    if not os.path.exists(args.input_vcf_file + ".tbi"):
        raise OSError("input vcf file {} needs to be indexed".format(args.input_vcf_file))
    if not os.path.exists(args.truth_vcf_file):
        raise OSError("truth vcf file {} does not exist".format(args.truth_vcf_file))
    if not os.path.exists(args.truth_vcf_file + ".tbi"):
        raise OSError("truth vcf file {} needs to be indexed".format(args.truth_vcf_file))
    if not os.path.exists(args.input_fasta_file):
        raise OSError("input fasta file {} does not exist".format(args.input_fasta_file))
    if not os.path.exists(args.input_fasta_file + ".fai"):
        raise OSError("input fasta file {} needs to be indexed".format(args.input_fasta_file))

    # update paths to absolute
    args.input_fasta_file = os.path.abspath(args.input_fasta_file)
    args.input_vcf_file = os.path.abspath(args.input_vcf_file)
    args.truth_vcf_file = os.path.abspath(args.truth_vcf_file)

    test_vcf = pysam.VariantFile(args.input_vcf_file)
    truth_vcf = pysam.VariantFile(args.truth_vcf_file)

    contigs = test_vcf.header.contigs
    truth_contigs = set(truth_vcf.header.contigs)

    test_vcf_samples = set(test_vcf.header.samples)
    truth_vcf_samples = set(truth_vcf.header.samples)
    common_samples = test_vcf_samples.intersection(truth_vcf_samples)
    if len(common_samples) == 0:
        raise ValueError("no common samples in test/truth VCFs")

    def pair_iterator(test_vcf, truth_vcf, contig):
        counter = E.Counter()
        test_iter = test_vcf.fetch(contig)
        truth_iter = truth_vcf.fetch(contig)
        test_record = next(test_iter)
        truth_record = next(truth_iter)
        try:
            while 1:
                if test_record.pos < truth_record.pos:
                    test_record = next(test_iter)
                    continue
                elif test_record.pos > truth_record.pos:
                    truth_record = next(truth_iter)
                    continue
                elif len(test_record.alts) > 1:
                    counter.skip_multiallelic_test += 1
                    test_record = next(test_iter)
                    continue
                elif len(truth_record.alts) > 1:
                    counter.skip_multiallelic_truth += 1
                    truth_record = next(truth_iter)
                    continue
                elif test_record.alts != truth_record.alts:
                    counter.skip_genotype_difference += 1
                    test_record = next(test_iter)
                    truth_record = next(truth_iter)
                    continue

                if test_record.ref != truth_record.ref:
                    # todo: deal with indels
                    raise ValueError(
                        "mismatching reference bases at position "
                        "{}:{}".format(test_record.chrom, test_record.pos))

                yield test_record, truth_record
                test_record = next(test_iter)
                truth_record = next(truth_iter)
        except StopIteration:
            pass

        E.debug(str(counter))

    counters_per_contig = {}
    for contig in contigs:
        counter_contig = collections.defaultdict(E.Counter)
        counters_per_contig[contig] = counter_contig

        E.info("processing contig {}".format(contig))
        if contig not in truth_contigs:
            E.warn("skipping contig {} as it is not in truth data".format(contig))
            continue

        switch = False
        last_is_unphased = True
        for test_record, truth_record in pair_iterator(test_vcf, truth_vcf, contig):
            for sample in common_samples:
                counter = counter_contig[sample]
                truth_phased = truth_record.samples[sample].phased
                test_phased = test_record.samples[sample].phased
                truth_genotype = truth_record.samples[sample]["GT"]
                test_genotype = test_record.samples[sample]["GT"]
                truth_alleles = set(truth_genotype)
                test_alleles = set(test_genotype)

                ignore = False
                if not truth_phased:
                    counter.truth_unphased += 1
                    ignore = True

                if not test_phased:
                    counter.test_unphased += 1
                    ignore = True
                    last_is_unphased = True
                else:
                    last_is_unphased = False

                if len(test_alleles) == 1:
                    counter.test_homozygous += 1
                    ignore = True
                else:
                    if not test_phased:
                        counter.test_unphased_hets += 1

                if len(truth_alleles) == 1:
                    counter.truth_homozygous += 1
                    ignore = True

                if ignore:
                    counter.ignore += 1
                    continue

                E.debug("comparing: {}:{} {} -> {}: {} {}".format(
                    test_record.chrom, test_record.pos, test_record.ref,
                    test_record.alts, test_genotype, truth_genotype))

                if switch:
                    truth_genotype = truth_genotype[::-1]

                counter.test_phased_hets += 1
                if truth_genotype != test_genotype:
                    if not last_is_unphased:
                        E.debug("SWITCH: {}".format(switch))
                        counter.switch += 1
                    switch = not switch

    outf = args.stdout
    outf.write("\t".join(("contig", "sample",
                          "switch_error_percent",
                          "false_negative_rate",
                          "switches",
                          "test_phased_hets",
                          "test_unphased_hets",
                          "test_unphased",
                          "truth_unphased",
                          "test_homozygous",
                          "truth_homozygous")) + "\n")

    for contig, contig_dict in list(counters_per_contig.items()):
        for sample, c in list(contig_dict.items()):
            outf.write("\t".join(map(str, (
                contig,
                sample,
                "{:6.4f}".format(100.0 * c.switch / (c.test_phased_hets + 1)),
                "{:6.4f}".format(100.0 * c.test_unphased_hets /
                                 (c.test_phased_hets + c.test_unphased_hets)),
                c.switch,
                c.test_phased_hets,
                c.test_unphased_hets,
                c.test_unphased,
                c.truth_unphased,
                c.test_homozygous,
                c.truth_homozygous))) + "\n")

    E.stop()
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtools standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('bed', type=argparse.FileType('r'),
                        help='RdTest-formatted bed file. '
                        '(chrom, start, end, name, samples, svtype)')
    parser.add_argument('samples', help='List of all samples present in '
                        'variant callset.')
    parser.add_argument('fout', help='Standardized VCF. Will be compressed '
                        'with bgzip and tabix indexed if filename ends with '
                        '.gz')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    # Get template header
    template = pkg_resources.resource_filename('svtools', 'data/standard_template.vcf')
    template = pysam.VariantFile(template)
    header = template.header

    # Get list of samples
    with open(args.samples) as slist:
        samples = sorted([s.strip() for s in slist.readlines()])

    # Template header includes all necessary FILTER, INFO, and FORMAT fields
    # Just need to add list of samples
    for sample in samples:
        header.add_sample(sample)

    # Tag source in header
    meta = ('##FORMAT=<ID=depth,Number=1,Type=Integer,'
            'Description="Called by read-depth algorithms">')
    header.add_line(meta)
    header.add_line('##source=depth')

    if args.fout.endswith('.vcf.gz'):
        fname = os.path.splitext(args.fout)[0]
    elif args.fout.endswith('.vcf'):
        fname = args.fout
    else:
        msg = 'Invalid VCF filename; must end with .vcf or .vcf.gz: {0}'
        msg = msg.format(args.fout)
        raise ValueError(msg)

    fout = pysam.VariantFile(fname, mode='w', header=header)
    rdtest2vcf(args.bed, fout)

    # TODO: do this with subprocess so we don't have to write to disk twice
    if args.fout.endswith('.gz'):
        pysam.tabix_compress(fname, args.fout)
        pysam.tabix_index(args.fout, preset='vcf')
        os.remove(fname)
""" Read the input vcf and use the model predictions to filter vcf by some threshold """ import sys import pysam vcf_file = sys.argv[1] bed_file = sys.argv[2] threshold = float(sys.argv[3]) predictions = {} genotypes = {0: (0, 0), 1: (0, 1), 2: (1, 1)} # go through the predictions bed file # and get the probability distribution for l in open(bed_file, 'r'): A = l.rstrip().split() key = '\t'.join(A[:3]) predictions[key] = [float(x) for x in A[3:]] # iterate over each record and replace the genotype # with the predicted genotypes with pysam.VariantFile(vcf_file, 'rb') as VCF: print(str(VCF.header).strip()) for variant in VCF: key = '\t'.join( [str(x) for x in [variant.contig, variant.pos, variant.stop]]) if key in predictions: if predictions[key][0] < threshold: print(str(variant).rstrip())
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-v', '--variants', required=True, help='Default VCF')
    parser.add_argument('-r', '--RDtest')
    parser.add_argument('-b', '--BAFtest')
    parser.add_argument('-s', '--SRtest')
    parser.add_argument('-p', '--PEtest')
    parser.add_argument('--batch-list', type=argparse.FileType('r'))
    parser.add_argument('--segdups', required=True)
    parser.add_argument('--rmsk', required=True)
    parser.add_argument('--fam')
    parser.add_argument('-d', '--bed', action='store_true', default=False)
    parser.add_argument('fout')
    args = parser.parse_args()

    if args.bed:
        # argparse always sets the attribute, so test its value rather than hasattr
        if args.batch_list is None:
            raise Exception('batch list must be specified when passing a bed')
        variants = open(args.variants)
        dtypes = 'RD BAF'.split()
    else:
        variants = pysam.VariantFile(args.variants)
        dtypes = 'PE SR RD BAF'.split()

    metadata = process_metadata(variants, args.bed, args.batch_list)

    # Calculate segdup coverage
    bt = pbt.BedTool.from_dataframe(metadata['chrom start end'.split()])
    segdups = pbt.BedTool(args.segdups)
    cov = bt.coverage(segdups).to_dataframe()
    metadata['poor_region_cov'] = cov.thickStart

    # Check if endpoints are in repeat-masked sequence
    starts = metadata['chrom start end name'.split()].copy()
    starts['end'] = starts['start'] + 1
    ends = metadata['chrom start end name'.split()].copy()
    ends['start'] = ends['end'] - 1
    endpoints = pd.concat([starts, ends])
    bt = pbt.BedTool.from_dataframe(endpoints)
    rmsk = pbt.BedTool(args.rmsk)
    sect = bt.intersect(rmsk, u=True)
    rmasked_names = [i.fields[3] for i in sect.intervals]
    metadata['rmsk'] = metadata.name.isin(rmasked_names)

    metadata = metadata.set_index('name')

    evidence = deque()
    for dtype in dtypes:
        dtable = getattr(args, dtype + 'test')
        if dtable is None:
            continue
        df = pd.read_table(dtable)
        df = preprocess(df, dtype)
        df = df.rename(columns=lambda c: dtype + '_' + c if c != 'name' else c)
        df = df.set_index('name')
        evidence.append(df)

    evidence = list(evidence)
    evidence = metadata.join(evidence, how='outer', sort=True)
    evidence = evidence.reset_index().rename(columns={'index': 'name'})

    has_petest = (getattr(args, 'PEtest') is not None)
    has_srtest = (getattr(args, 'SRtest') is not None)
    if not args.bed and has_petest and has_srtest:
        evidence = add_pesr(evidence)

    # Replace infinite log-pvals
    LOG_CEIL = 300
    evidence = evidence.replace(np.inf, LOG_CEIL)

    evidence = evidence.reindex(columns=make_columns())
    evidence.to_csv(args.fout, index=False, sep='\t', na_rep='NA')
def _parse_sam_file_and_vcf(cls, samfile, query_vcf_file, flank_length,
                            allow_mismatches, exclude_regions=None,
                            max_soft_clipped=3, number_ns=0):
    if exclude_regions is None:
        exclude_regions = {}

    found = []
    match_flag = []
    correct_allele = []
    gt_conf = []
    allele = []

    samfile_handle = pysam.AlignmentFile(samfile, "r")
    sam_previous_record_name = None
    for sam_record in samfile_handle.fetch(until_eof=True):
        if sam_record.query_name == sam_previous_record_name:
            continue
        sam_previous_record_name = sam_record.query_name
        found_conf = False
        found_allele = False

        # see if excluded region in bed file
        # (query names encode ref.start.ref_num.var_num.allele_num, hence maxsplit=4)
        ref, start, ref_num, var_num, allele_num = sam_record.query_name.rsplit('.', maxsplit=4)
        start = int(start) + flank_length
        exclude = False
        for ref_name in exclude_regions.keys():
            end = int(start) + 1
            interval = pyfastaq.intervals.Interval(start, end)
            exclude = EvaluateRecall._interval_intersects_an_interval_in_list(
                interval, exclude_regions[ref_name])
        if exclude:
            found.append('Exclude')
            gt_conf.append(0)
            allele.append('0')
            # keep the per-record lists the same length so the final asserts hold
            match_flag.append('Exclude')
            correct_allele.append('Exclude')
            continue

        match = EvaluateRecall._check_if_sam_match_is_good(
            sam_record, flank_length,
            query_sequence=sam_record.query_sequence,
            allow_mismatches=allow_mismatches,
            max_soft_clipped=max_soft_clipped)
        alignment_start = str(sam_record).split("\t")[3]
        match_flag.append(match)
        if match == 'Good':
            logging.debug('SAM record is a good match')
            logging.debug('SAM record reference is %s' % sam_record.reference_name)
            ref_name, expected_start, vcf_pos_index, vcf_record_index, allele_index = \
                sam_record.reference_name.rsplit('.', maxsplit=4)

            vcf_reader = pysam.VariantFile(query_vcf_file)
            vcf_interval_start = int(expected_start) + int(alignment_start) + flank_length - 2 - number_ns
            vcf_interval_end = int(expected_start) + int(alignment_start) + flank_length - number_ns
            logging.debug('Find VCF records matching ref %s in interval [%i,%i]'
                          % (ref_name, vcf_interval_start, vcf_interval_end))
            for i, vcf_record in enumerate(vcf_reader.fetch(ref_name, vcf_interval_start, vcf_interval_end)):
                if i == int(vcf_pos_index):
                    sample_name = vcf_record.samples.keys()[0]
                    if 'GT' in vcf_record.format.keys() and len(set(vcf_record.samples[sample_name]['GT'])) == 1:
                        if int(allele_index) == int(vcf_record.samples[sample_name]['GT'][0]):
                            found.append('1')
                            allele.append(str(allele_index))
                            correct_allele.append('1')
                            found_allele = True
                            if 'GT_CONF' in vcf_record.format.keys():
                                gt_conf.append(int(float(vcf_record.samples[sample_name]['GT_CONF'])))
                                found_conf = True
        if not found_allele:
            found.append('0')
            allele.append('0')
            correct_allele.append('0')
        if not found_conf:
            gt_conf.append(0)

    assert len(found) == len(gt_conf)
    assert len(found) == len(allele)
    assert len(found) == len(match_flag)
    assert len(found) == len(correct_allele)
    return found, gt_conf, allele, match_flag, correct_allele
if not os.path.isfile(args.vcf):
    print("Cannot find input file ", args.vcf)
    sys.exit(1)

if not (os.path.isfile(args.vcf + ".tbi") or os.path.isfile(args.vcf + ".csi")):
    call(["bcftools", "index", args.vcf])

# Merge the file with ALT variants
alts = '/home/mzarowiecki/scratch/REF/allASDPs.SNV.50_10.valid.vcf.gz'
call(["bcftools", "merge", "--force-samples", "-O", "z",
      "-o", args.vcf + ".asdp.vcf.gz", args.vcf, alts])

# read the input file
myvcf = pysam.VariantFile(args.vcf + ".asdp.vcf.gz", "r")

# create an object of new bed file and open in to write data.
output = args.vcf + ".asdp.res.vcf.gz"
out = open(output + '.review', 'w')
vaf = open(output + '.vaf', 'w')

myvcf.header.info.add("ALT", "1", "String", "Is variant on ALT or primary")

# create an object of new vcf file and open in to write data.
vcf_out = pysam.VariantFile(output, 'w', header=myvcf.header)
def retrieve_entry_from_test_query_vcf(idx: int) -> pysam.VariantRecord:
    with pysam.VariantFile(TEST_QUERY_VCF) as vcf:
        for i, record in enumerate(vcf):
            if i == idx:
                return record
    raise IndexError("You asked for an index that is beyond the number in the test VCF")
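# Illustrative test usage (not from the original source): fetch the third
# record of the module-level TEST_QUERY_VCF fixture and check a field on it.
record = retrieve_entry_from_test_query_vcf(2)
assert record.pos > 0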
def read_vcf(infile, sample_id=None, normal_id=None, min_depth=None,
             skip_reject=False, skip_somatic=False):
    """Read one tumor-normal pair or unmatched sample from a VCF file.

    By default, return the first tumor-normal pair or unmatched sample in the
    file. If `sample_id` is a string identifier, return the (paired or single)
    sample matching that ID. If `sample_id` is a positive integer, return the
    sample or pair at that index position, counting from 0.
    """
    # if isinstance(infile, basestring):
    #     vcf_reader = vcf.Reader(filename=infile)
    # else:
    #     vcf_reader = vcf.Reader(infile)
    try:
        vcf_reader = pysam.VariantFile(infile)
    except Exception as exc:
        raise ValueError("Must give a VCF filename, not open file handle: %s" % exc)

    if vcf_reader.header.samples:
        sid, nid = _choose_samples(vcf_reader, sample_id, normal_id)
        logging.info("Selected test sample " + str(sid) +
                     (" and control sample %s" % nid if nid else ''))
        # NB: in-place
        vcf_reader.subset_samples(list(filter(None, (sid, nid))))
    else:
        logging.warning("VCF file %s has no sample genotypes", infile)
        sid = sample_id
        nid = None

    columns = ['chromosome', 'start', 'end', 'ref', 'alt',
               'somatic', 'zygosity', 'depth', 'alt_count']
    if nid:
        columns.extend(['n_zygosity', 'n_depth', 'n_alt_count'])

    rows = _parse_records(vcf_reader, sid, nid, skip_reject)
    table = pd.DataFrame.from_records(rows, columns=columns)
    table['alt_freq'] = table['alt_count'] / table['depth']
    if nid:
        table['n_alt_freq'] = table['n_alt_count'] / table['n_depth']
    table = table.fillna({col: 0.0 for col in table.columns[6:]})

    # Filter out records as requested
    cnt_depth = cnt_som = 0
    if min_depth:
        if table['depth'].any():
            dkey = 'n_depth' if 'n_depth' in table else 'depth'
            idx_depth = table[dkey] >= min_depth
            cnt_depth = (~idx_depth).sum()
            table = table[idx_depth]
        else:
            logging.warning("Depth info not available for filtering")
    if skip_somatic:
        idx_som = table['somatic']
        cnt_som = idx_som.sum()
        table = table[~idx_som]

    logging.info("Loaded %d records; skipped: %d somatic, %d depth",
                 len(table), cnt_som, cnt_depth)
    # return sid, nid, table
    return table
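# Hypothetical usage of read_vcf (the path and sample IDs are placeholders):
# pick a tumor/normal pair by ID and drop shallow and somatic records up front.
table = read_vcf('pair.vcf.gz', sample_id='TUMOR', normal_id='NORMAL',
                 min_depth=20, skip_somatic=True)
print(table[['chromosome', 'start', 'alt_freq']].head())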
def main(argv=None):
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    parser.add_option(
        "-s", "--sample-size", dest="sample_size", type="float",
        help="sample size. If less than 0, take a proportion of the "
        "chromosome size. If greater than 0, take a fixed number of "
        "variants [%default]")
    parser.add_option(
        "--input-filename-fasta", dest="input_filename_fasta", type="string",
        help="filename with reference sequence in fasta format [%default]")
    parser.add_option(
        "--input-filename-bam", dest="input_filename_bam", type="string",
        help="filename with aligned reads [%default]")
    parser.add_option(
        "--no-vcf-columns", dest="no_vcf_columns", action="store_true",
        help="do not output vcf columns")
    parser.add_option(
        "--counter", dest="counters", type="choice", action="append",
        choices=["context", "bam-indels", "bam-allelic-depth", "indel-type"],
        help="counters to apply [%default]")
    parser.set_defaults(
        input_filename_fasta=None,
        input_filename_bam=None,
        input_filename_vcf=None,
        sample_size=0.001,
        sample_name="NA12878",
        region_size=20,
        threshold_homopolymer=12,
        threshold_repeat=5,
        no_vcf_columns=False,
        counters=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) > 0:
        options.input_filename_vcf = args[0]

    vcf_in = pysam.VariantFile(options.input_filename_vcf)

    counters = []

    if options.input_filename_fasta:
        fasta = pysam.FastaFile(options.input_filename_fasta)
    else:
        fasta = None

    if options.input_filename_bam:
        bam = pysam.AlignmentFile(options.input_filename_bam)
    else:
        bam = None

    for counter in options.counters:
        if counter == "context":
            counters.append(CounterContext(fasta))
        elif counter == "bam-indels":
            counters.append(CounterBAMIndels(bam))
        elif counter == "bam-allelic-depth":
            counters.append(CounterBAMAllelicDepth(bam))
        elif counter == "indel-type":
            counters.append(CounterIndelType())

    outf = options.stdout
    if not options.no_vcf_columns:
        header = str(vcf_in.header).strip().split("\n")[-1].strip()[1:].split("\t")
    else:
        header = ["chrom", "pos"]
    outf.write("\t".join(header))
    for counter in counters:
        outf.write("\t" + "\t".join(counter.header))
    outf.write("\n")

    for record in vcf_in:
        for counter in counters:
            counter.count(record)
        if not options.no_vcf_columns:
            outf.write("{}\t".format(str(record).strip()))
        else:
            outf.write("{}\t{}\t".format(record.chrom, record.pos))
        outf.write("\t".join(map(str, counters)) + "\n")

    E.stop()
def recall_variants(args):
    (variants, alignment_file_path, target_path, mode, germline_variants_path,
     germline_variants_sample, germline_bam_path, window_radius,
     MAX_REF_MOLECULES, max_buffer_size) = args

    window_radius = 600
    MAX_REF_MOLECULES = 1_000  # Maximum amount of reference molecules to process.
    # This is capped for regions to which many reads map (mapping artefact)

    variant_calls = dict()  # cell -> (chrom, pos) +/- ?

    # Set up molecule iterator (1/2)
    if mode == 'NLA':
        mc = NlaIIIMolecule
        fc = NLAIIIFragment
    else:
        mc = Molecule
        fc = Fragment

    locations_done = set()
    alignments = pysam.AlignmentFile(alignment_file_path, threads=4)
    if germline_bam_path is not None:
        germline_alignments = pysam.AlignmentFile(germline_bam_path, threads=4)

    for variant in variants:
        # Check if the variant is present in the germline bam file (if supplied)
        if germline_bam_path is not None and has_variant_reads(
                germline_alignments, variant.chrom, variant.pos - 1,
                variant.alts[0], min_reads=1, stepper='nofilter'):
            print(f'FOUND IN GERMLINE {variant}')
            continue

        overlap = False
        reference_start = max(0, variant.pos - window_radius)
        reference_end = variant.pos + window_radius
        contig = variant.contig
        variant_key = (contig, variant.pos, variant.ref, variant.alts[0])

        # Set up allele resolver
        unphased_allele_resolver = singlecellmultiomics.alleleTools.AlleleResolver(
            use_cache=False, phased=False, verbose=True)
        if germline_variants_path is not None:
            with pysam.VariantFile(germline_variants_path) as germline:
                for i, ar_variant in enumerate(
                        germline.fetch(variant.chrom, reference_start, reference_end)):
                    if germline_variants_sample is None:
                        # If any of the samples is not heterozygous: continue
                        if any(len(set(ar_variant.samples[sample].alleles)) != 2
                               for sample in ar_variant.samples):
                            continue
                    elif len(set(ar_variant.samples[germline_variants_sample].alleles)) != 2:
                        continue
                    unphased_allele_resolver.locationToAllele[ar_variant.chrom][ar_variant.pos - 1] = {
                        ar_variant.alleles[0]: {'U'},
                        ar_variant.alleles[1]: {'V'}
                    }

        ref_phased = Counter()
        alt_phased = Counter()

        # Set up molecule iterator (2/2)
        try:
            molecule_iter = MoleculeIterator(
                alignments, mc, fc,
                contig=contig,
                start=reference_start,
                end=reference_end,
                molecule_class_args={
                    'allele_resolver': unphased_allele_resolver,
                    'max_associated_fragments': 20,
                },
                max_buffer_size=max_buffer_size)

            reference_called_molecules = []  # (molecule, phase)
            extracted_base_call_count = 0
            alt_call_count = 0
            for mi, molecule in enumerate(molecule_iter):
                base_call = get_molecule_base_calls(molecule, variant)
                if base_call is None:
                    continue
                extracted_base_call_count += 1
                base, quality = base_call
                call = None
                if base == variant.alts[0]:
                    call = 'A'
                    alt_call_count += 1
                    if molecule.sample not in variant_calls:
                        variant_calls[molecule.sample] = {}
                    variant_calls[molecule.sample][variant_key] = 1
                elif base == variant.ref:
                    call = 'R'
                if call is None:
                    continue

                # Obtain all germline variants which are phased:
                phased = get_phased_variants(molecule, unphased_allele_resolver)

                if call == 'R' and len(phased) > 0:
                    # If we can phase the alternative allele to a germline variant
                    # the reference calls can indicate absence
                    if len(reference_called_molecules) < MAX_REF_MOLECULES:
                        reference_called_molecules.append((molecule, phased))

                for chrom, pos, base in phased:
                    if call == 'A':
                        alt_phased[(chrom, pos, base)] += 1
                    elif call == 'R':
                        ref_phased[(chrom, pos, base)] += 1
        except MemoryError:
            print(f"Buffer exceeded for {variant.contig} {variant.pos}")
            continue

        # print(mi, extracted_base_call_count, alt_call_count)
        if len(alt_phased) > 0 and len(reference_called_molecules):
            # Clean the alt_phased variants for variants which are not >90% the same
            alt_phased_filtered = filter_alt_calls(alt_phased, 0.9)
            # print(alt_phased_filtered)
            for molecule, phased_gsnvs in reference_called_molecules:
                for p in phased_gsnvs:
                    if p in alt_phased_filtered:
                        if molecule.sample not in variant_calls:
                            variant_calls[molecule.sample] = {}
                        variant_calls[molecule.sample][variant_key] = 0
                        break
        locations_done.add(variant_key)

    alignments.close()
    return variant_calls, locations_done
get_conf_int.output_non_cov_call_info(output_dir, SV_positions_file,
                                      assem1_non_cov_regions_file,
                                      assem2_non_cov_regions_file)

# get filtered sv info, using results from get_conf_int.py
exclude_assem1_non_cover, exclude_assem2_non_cover = validate.get_filtered_sv_pos(
    output_dir + "exclude_assem1_non_cover.bed",
    output_dir + "exclude_assem2_non_cover.bed")

dict_centromere = validate.build_centro_dict(centromere_file)

##################################################################################
##################################################################################

# index SVs
f = pysam.VariantFile(vcf_file, 'r')
sv_list = []
for count, rec in enumerate(f.fetch()):
    # get sv_type
    try:
        sv_type = rec.info['SVTYPE']
    except KeyError:
        print("invalid sv type info")
        continue

    if first_filter(rec, sv_type):
        continue

    # get sv length
    if sv_type == 'INV':
        sv_len = abs(rec.stop - rec.pos + 1)
def job_gen(induced_variants_path, germline_variants_path,
            germline_variants_sample, alignments_path, block_size=100,
            n=None, contig=None, completed=None, min_qual=None,
            germline_bam_path=None, MAX_REF_MOLECULES=1000,
            window_radius=600, max_buffer_size=100_000):
    """ Job generator

    block_size(int) : variants per block
    n(int) : amount of blocks to generate
    min_qual(float) : minimum quality score of variants to process
    contig : contig to generate jobs for
    completed(set) : set of locations which should be skipped
    """
    i = 0
    with pysam.VariantFile(induced_variants_path, ignore_truncation=True) as sc_calls:
        vlist = []
        for record in sc_calls:
            if contig is not None and record.chrom != contig:
                continue
            if completed is not None and (record.chrom, record.pos) in completed:
                continue
            if min_qual is not None and record.qual < min_qual:
                continue
            if len(record.alts[0]) != 1 or len(record.ref) != 1:
                continue
            k = (record.chrom, record.pos)
            vlist.append(VariantWrapper(record))
            if len(vlist) >= block_size:
                # f'./{extraction_folder}/variants_extracted_0_NLA_{i}.bam'
                yield (vlist, alignments_path, None, 'NLA',
                       germline_variants_path, germline_variants_sample,
                       germline_bam_path, window_radius, MAX_REF_MOLECULES,
                       max_buffer_size)
                vlist = []
                i += 1
                if n is not None and i >= n:
                    break
        if len(vlist):
            yield (vlist, alignments_path, None, 'NLA',
                   germline_variants_path, germline_variants_sample,
                   germline_bam_path, window_radius, MAX_REF_MOLECULES,
                   max_buffer_size)
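# Minimal sketch of consuming job_gen with a worker pool; the file names and
# sample name are hypothetical. Each yielded tuple matches the unpacking at
# the top of recall_variants above.
from multiprocessing import Pool

jobs = job_gen('induced.vcf.gz', 'germline.vcf.gz', 'SAMPLE1', 'cells.bam',
               block_size=50, contig='chr1', min_qual=20)
with Pool(4) as pool:
    for variant_calls, locations_done in pool.imap_unordered(recall_variants, jobs):
        print(len(variant_calls), 'cells with calls;', len(locations_done), 'sites done')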
introns = annot.features.introns()
features = {
    k: annot[annot.Feature == k].drop()
    for k in ['CDS', 'five_prime_utr', 'three_prime_utr']
}
features['intron'] = introns.drop()

# high confidence regions
highc = pyranges.read_bed(snakemake.input.hc_bed)

# exome sequencing target regions
exometarg = pyranges.read_bed(snakemake.input.es_bed)

# load the variants into a pyranges object
prpm = vcf_to_pyranges(pysam.VariantFile(snakemake.input.vcf),
                       tmpfile=snakemake.output.tsv + '_tmp.bed')

# count overlaps to different feature types
prpm = pyranges.count_overlaps(features, prpm)

# annotation by majority vote
main_anno = np.argmax(
    prpm.as_df()[['CDS', 'five_prime_utr', 'three_prime_utr', 'intron']].values,
    axis=1)
d = {i: k for i, k in enumerate(['CDS', 'five_prime_utr', 'three_prime_utr', 'intron'])}
main_anno = pd.Series(main_anno).map(d)
help="VCF file") # Check for no input if len(sys.argv) == 1: parser.print_help() sys.exit(1) args = parser.parse_args() # Check if input files exist if not os.path.isfile(args.vcf) == True: print("Cannot find input file ", args.vcf) sys.exit(1) # read the input file myvcf = pysam.VariantFile(args.vcf, "r") # create an object of new bed file and open in to write data. output = args.vcf + ".bed" out = open(output, 'w') for r in myvcf: #### FILTER OUT ##### # Shared called total # Filter out sites which chr = r.chrom pos = r.pos id = str(r.id) varID = ':'.join([id.split(":")[0], id.split(":")[1]]) #altb = r.ref
def pe_test(argv):
    parser = argparse.ArgumentParser(
        description="Calculate enrichment of discordant pairs at SV breakpoints.",
        prog='svtk pe-test',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Variants.')
    parser.add_argument('disc', help='Table of discordant pair coordinates.')
    parser.add_argument('fout', type=argparse.FileType('w'),
                        help='Output table of PE counts.')
    parser.add_argument('-o', '--window-out', type=int, default=500,
                        help='Window outside breakpoint to query for '
                        'discordant pairs. [500]')
    parser.add_argument('-i', '--window-in', type=int, default=50,
                        help='Window inside breakpoint to query for '
                        'discordant pairs. [50]')
    parser.add_argument('-b', '--background', type=int, default=160,
                        help='Number of background samples to sample for PE '
                        'evidence. [160]')
    parser.add_argument('--common', default=False, action='store_true',
                        help='Ignore background for common AF')
    parser.add_argument('-s', '--samples', type=argparse.FileType('r'),
                        default=None,
                        help='Whitelist of samples to restrict testing to.')
    parser.add_argument('--index', default=None,
                        help='Tabix index of discordant pair file. Required if '
                        'discordant pair file is hosted remotely.')
    parser.add_argument('--medianfile', default=None,
                        help='Median coverage statistics for each library '
                        '(optional). If provided, each sample\'s split '
                        'counts will be normalized accordingly. '
                        'Same format as RdTest, one column per sample.')
    parser.add_argument('--log', action='store_true', default=False,
                        help='Print progress log to stderr.')

    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if args.vcf in '- stdin'.split():
        vcf = pysam.VariantFile(sys.stdin)
    else:
        vcf = pysam.VariantFile(args.vcf)

    # args.fout is already opened by argparse.FileType, which maps '-' to stdout
    fout = args.fout

    header = 'name log_pval called_median bg_median bg_frac'.split()
    fout.write('\t'.join(header) + '\n')

    if args.samples is not None:
        whitelist = [s.strip() for s in args.samples.readlines()]
    else:
        whitelist = None

    if args.index is not None:
        discfile = pysam.TabixFile(args.disc, index=args.index)
    else:
        if args.disc.startswith('http'):
            raise Exception('Must provide tabix index with remote URL')
        discfile = pysam.TabixFile(args.disc)

    if args.medianfile is not None:
        medians = pd.read_table(args.medianfile)
        medians = pd.melt(medians, var_name='sample', value_name='median_cov')
    else:
        medians = None

    runner = PETestRunner(vcf, discfile, fout, args.background, args.common,
                          args.window_in, args.window_out, whitelist,
                          medians=medians, log=args.log)
    runner.run()
def read_vcf_samples(vcf_filename):
    vcf = ps.VariantFile(str(vcf_filename))
    return vcf.header.samples
def count_pe(argv):
    parser = argparse.ArgumentParser(
        description="Count discordant pairs supporting SV breakpoints.",
        prog='svtk count-pe',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Variants.')
    parser.add_argument('disc', help='Table of discordant pair coordinates.')
    parser.add_argument('fout', type=argparse.FileType('w'),
                        help='Output table of PE counts.')
    parser.add_argument('-o', '--window-out', type=int, default=500,
                        help='Window outside breakpoint to query for '
                        'discordant pairs. [500]')
    parser.add_argument('-i', '--window-in', type=int, default=50,
                        help='Window inside breakpoint to query for '
                        'discordant pairs. [50]')
    parser.add_argument('--common', default=False, action='store_true',
                        help='Ignore background for common AF')
    parser.add_argument('-s', '--samples', type=argparse.FileType('r'),
                        default=None,
                        help='Whitelist of samples to restrict testing to.')
    parser.add_argument('--index', default=None,
                        help='Tabix index of discordant pair file. Required if '
                        'discordant pair file is hosted remotely.')
    parser.add_argument('--medianfile', default=None,
                        help='Median coverage statistics for each library '
                        '(optional). If provided, each sample\'s split '
                        'counts will be normalized accordingly. '
                        'Same format as RdTest, one column per sample.')

    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if args.vcf in '- stdin'.split():
        vcf = pysam.VariantFile(sys.stdin)
    else:
        vcf = pysam.VariantFile(args.vcf)

    # args.fout is already opened by argparse.FileType, which maps '-' to stdout
    fout = args.fout

    header = 'name sample count'.split()
    fout.write('\t'.join(header) + '\n')

    if args.samples is not None:
        whitelist = [s.strip() for s in args.samples.readlines()]
    else:
        whitelist = [s for s in vcf.header.samples]

    if args.index is not None:
        discfile = pysam.TabixFile(args.disc, index=args.index)
    else:
        if args.disc.startswith('http'):
            raise Exception('Must provide tabix index with remote URL')
        discfile = pysam.TabixFile(args.disc)

    if args.medianfile is not None:
        medians = pd.read_table(args.medianfile)
        medians = pd.melt(medians, var_name='sample', value_name='median_cov')
    else:
        medians = None

    petest = PETest(discfile, args.common, args.window_in, args.window_out,
                    medians=medians)

    for record in vcf:
        counts = petest.load_counts(record, args.window_in, args.window_out)
        counts = petest.normalize_counts(counts)
        counts = counts.set_index('sample')
        counts = counts.reindex(whitelist).fillna(0).astype(int)
        counts = counts.reset_index()
        counts['name'] = record.id

        cols = 'name sample count'.split()
        for row in counts[cols].values:
            fout.write('\t'.join([str(x) for x in row]) + '\n')
def read_octopus_header_info(vcf_filename):
    vcf = ps.VariantFile(str(vcf_filename))
    for record in vcf.header.records:
        if record.key == "octopus":
            return dict(record)
    return None
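# Illustrative check (not from the original source; the file name is a
# placeholder): read the octopus header record, if present, and report it.
info = read_octopus_header_info('octopus.vcf.gz')
if info is not None:
    print('octopus header:', info)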
def format_gdc_vcf(
    input_vcf: str,
    output_vcf: str,
    patient_barcode: str,
    case_id: str,
    tumor_barcode: str,
    tumor_aliquot_uuid: str,
    tumor_bam_uuid: str,
    normal_barcode: str,
    normal_aliquot_uuid: str,
    normal_bam_uuid: str,
    *,
    reference_name: str = "GRCh38.d1.vd1.fa",
) -> None:
    """
    Adds VCF header metadata specific to the GDC.

    :param input_vcf: The input VCF file to format.
    :param output_vcf: The output formatted VCF file to create. BGzip and
        tabix-index created if ends with '.gz'.
    :param patient_barcode: The case submitter id.
    :param case_id: The case uuid.
    :param tumor_barcode: The tumor aliquot submitter id.
    :param tumor_aliquot_uuid: The tumor aliquot uuid.
    :param tumor_bam_uuid: The tumor bam uuid.
    :param normal_barcode: The normal aliquot submitter id.
    :param normal_aliquot_uuid: The normal aliquot uuid.
    :param normal_bam_uuid: The normal bam uuid.
    :param reference_name: Reference name to use in header.
    """
    logger = Logger.get_logger("format_gdc_vcf")
    logger.info("Format GDC tumor/normal paired VCFs.")

    # setup
    reader = pysam.VariantFile(input_vcf)
    mode = get_pysam_outmode(output_vcf)

    # Load new header
    new_header = build_header(
        reader,
        patient_barcode,
        case_id,
        tumor_barcode,
        tumor_aliquot_uuid,
        tumor_bam_uuid,
        normal_barcode,
        normal_aliquot_uuid,
        normal_bam_uuid,
        reference_name,
    )
    writer = pysam.VariantFile(output_vcf, mode=mode, header=new_header)

    # Process
    try:
        for record in reader.fetch():
            writer.write(record)
    finally:
        reader.close()
        writer.close()

    if mode == "wz":
        logger.info("Creating tabix index...")
        pysam.tabix_index(output_vcf, preset="vcf", force=True)
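# Hypothetical invocation of format_gdc_vcf; every barcode and UUID below is a
# placeholder. An output name ending in '.gz' selects the bgzip output mode,
# which triggers the tabix-indexing branch above.
format_gdc_vcf(
    'raw.vcf', 'formatted.vcf.gz',
    patient_barcode='TCGA-XX-XXXX', case_id='case-uuid',
    tumor_barcode='TCGA-XX-XXXX-01A', tumor_aliquot_uuid='tumor-aliquot-uuid',
    tumor_bam_uuid='tumor-bam-uuid',
    normal_barcode='TCGA-XX-XXXX-10A', normal_aliquot_uuid='normal-aliquot-uuid',
    normal_bam_uuid='normal-bam-uuid',
)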
def main():
    description = ('Process a .gvcf file to create a file of consensus '
                   'variants, low-frequency variants and a coverage mask')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-m', '--mask-output', required=True,
                        help='The output file name for the coverage mask')
    parser.add_argument('-v', '--variants-output', required=True,
                        help='The output file name for variants '
                        '(non-reference gVCF records)')
    parser.add_argument('-c', '--consensus-sites-output', required=True,
                        help='The output file name for variants that will be '
                        'applied to generate the consensus sequence')
    parser.add_argument('-d', '--min-depth', type=int, default=10,
                        help='Mask reference positions with depth less than '
                        'this threshold')
    parser.add_argument('-l', '--lower-ambiguity-frequency', type=float,
                        default=0.15,
                        help='Variants with frequency less than -l will be '
                        'discarded')
    parser.add_argument('-u', '--upper-ambiguity-frequency', type=float,
                        default=0.75,
                        help='Substitution variants with frequency less than '
                        '-u will be encoded with IUPAC ambiguity codes')
    parser.add_argument('file', action='store', nargs=1)
    args = parser.parse_args()

    vcf = pysam.VariantFile(open(args.file[0], 'r'))

    # Initialize depth mask to all zeros for all contigs
    contig_depth = defaultdict(list)
    for r in vcf.header.records:
        if r.type == "CONTIG":
            contig_depth[r['ID']] = [0] * int(r['length'])

    out_header = vcf.header

    # open the output file with the filtered variant sites
    out_header.info.add("VAF", number="A", type='Float',
                        description="Variant allele fraction, called from "
                        "observed reference/alt reads")
    variants_out = pysam.VariantFile(args.variants_output, 'w', header=out_header)

    # open the output file with the changes to apply to the consensus fasta;
    # this includes an additional tag in the VCF file
    out_header.info.add("ConsensusTag", number=1, type='String',
                        description="The type of base to be included in the "
                        "consensus sequence (IUPAC or Fixed)")
    consensus_sites_out = pysam.VariantFile(args.consensus_sites_output, 'w',
                                            header=out_header)

    for record in vcf:
        is_gvcf_ref = record.alts[0] == "<*>"

        # set depth for this part of the genome
        # this works for both gVCF blocks and regular variants
        # because pos/stop are set appropriately
        v_start = record.pos
        v_end = record.stop
        depth = record.info["DP"]

        # disallow gvcf records that are longer than a single base
        assert (not is_gvcf_ref or v_start == v_end)

        # update depth mask
        for i in range(v_start, v_end + 1):
            assert (i > 0)
            # VCF coordinates are 1-based, we record the depth vector
            # as 0-based to be consistent with artic-mask
            contig_depth[record.chrom][i - 1] = depth

        # do nothing else with ref records, or records that don't
        # meet our minimum depth
        if is_gvcf_ref or depth < args.min_depth:
            continue

        # determine if any allele in the variant is an indel
        has_indel = False
        for i in range(0, len(record.alts)):
            has_indel = has_indel or len(record.ref) != len(record.alts[i])

        # process the input variant record to handle multi-allelic
        # variants and MNPs
        out_records = list()
        if has_indel:
            # indels need to be handled specially as we can't apply
            # ambiguity codes
            out_records = handle_indel(out_header, record)
        else:
            out_records = handle_sub(out_header, record)

        # classify variants using VAF cutoffs for IUPAC ambiguity codes, etc.
        accept_variant = False
        for out_r in out_records:
            # at this point we should have resolved multi-allelic variants
            assert (len(out_r.alts) == 1)
            vaf = out_r.info["VAF"][0]
            is_indel = len(out_r.ref) != len(out_r.alts[0])

            # discard low frequency variants
            if vaf < args.lower_ambiguity_frequency:
                continue

            # Write a tag describing what to do with the variant
            consensus_tag = "None"

            # high-frequency subs and indels are always applied without
            # ambiguity; we don't have to do an indel VAF check here as it
            # is dealt with in handle_indel
            if vaf > args.upper_ambiguity_frequency or is_indel:
                # always apply these to the consensus
                consensus_tag = "fixed"
            else:
                # record ambiguous SNPs in the consensus sequence
                # with IUPAC codes
                consensus_tag = "ambiguous"
            out_r.info["ConsensusTag"] = consensus_tag
            consensus_sites_out.write(out_r)
            accept_variant = True

        if accept_variant:
            record.info["VAF"] = calculate_vafs(record)
            variants_out.write(record)

    write_depth_mask(args.mask_output, contig_depth, args.min_depth)
return None, rec.info["DP"], af else: alt_counts = None if alt_counts is None or depth is None or depth == 0: return None, None, None else: freq = float(alt_counts) / float(depth) return alt_counts, depth, freq def _cur_workdir(data): return utils.safe_makedir(os.path.join(data["dirs"]["work"], "heterogeneity", dd.get_sample_name(data), "bubbletree")) if __name__ == "__main__": import sys bcf_in = pysam.VariantFile(sys.argv[1]) somatic = collections.namedtuple("Somatic", "normal_name,tumor_name") params = {"min_freq": 0.4, "max_freq": 0.6, "min_depth": 15} for rec in bcf_in: if _is_possible_loh(rec, bcf_in, params, somatic(sys.argv[2], sys.argv[3])): print(rec.filter.keys(), len(rec.filter)) _script = """ .libPaths(c("{local_sitelib}")) library(BubbleTree) library(GenomicRanges) library(ggplot2) vc.df = read.csv("{vcf_csv}", header=T)
def openFile(self, filename):
    return pysam.VariantFile(filename)