def main(fname_in, fname_out, ambiguous_base_coverage_threshold):
    """Rewrite every record's genotype from its per-allele coverage.

    For each record of ``fname_in`` the allele list is ``REF`` followed by
    the ``ALT`` alleles and the matching coverages are read from ``INFO/AD``.
    Every allele whose share of the summed coverage is at least
    ``ambiguous_base_coverage_threshold`` is put into the genotype of the
    record, which is then written to ``fname_out``.

    Args:
        fname_in: path of the VCF to read.
        fname_out: path of the VCF to write (header copied from the input).
        ambiguous_base_coverage_threshold: frequency threshold to include a
            variant in computation of ambiguous code.
    """
    reader = VCF(fname_in)
    writer = Writer(fname_out, reader)

    for record in reader:
        alleles = [record.REF] + record.ALT
        depths = record.INFO.get("AD")
        depth_sum = sum(depths)
        assert len(alleles) == len(depths)

        # Allele index 0 is the reference; only the index goes into the
        # genotype, the base itself is not needed.
        # NOTE(review): a record whose AD values sum to 0 raises
        # ZeroDivisionError here - confirm that cannot occur upstream.
        selected = []
        for allele_index, (_base, depth) in enumerate(zip(alleles, depths)):
            if depth / depth_sum >= ambiguous_base_coverage_threshold:
                selected.append(allele_index)

        # Single sample; the trailing False is the phasing flag.
        record.genotypes = [selected + [False]]
        writer.write_record(record)

    writer.close()
    reader.close()
def count_TP_FP_FN(directory_combined_caller, type_combined_caller,
                   directory_individual_caller):
    """Write TP/FP/FN counts for a combined caller to ``counts.json``.

    Records of ``<directory_combined_caller>/<type_combined_caller>.sorted.vcf.gz``
    with a truthy ``TruScore`` INFO value count as true positives, all other
    records as false positives.  The total number of events is taken from
    ``<directory_individual_caller>/summary.txt`` (JSON) as
    ``TP-base + FN``; the false negatives are that total minus the true
    positives found here.
    """
    calls = VCF(
        f'{directory_combined_caller}/{type_combined_caller}.sorted.vcf.gz')
    true_positives = 0
    total_calls = 0
    for call in calls:
        total_calls += 1
        if call.INFO.get('TruScore'):
            true_positives += 1
    calls.close()
    false_positives = total_calls - true_positives

    with open(f'{directory_individual_caller}/summary.txt', 'r') as summary_file:
        individual_counts = json.load(summary_file)
    total_events = individual_counts['TP-base'] + individual_counts['FN']

    # evaluate.ipynb assumes the existence of the 'TP-base' and 'FN' keys.
    combined_counts = {
        'TP-base': true_positives,
        'FP': false_positives,
        'FN': total_events - true_positives,
    }
    with open(f'{directory_combined_caller}/counts.json', 'w') as counts_file:
        json.dump(combined_counts, counts_file, indent=2)
def filter_annotate_calls():
    """Filter calls region by region and annotate the kept ones with a confidence.

    Command-line arguments (all are path prefixes without extension):
        --alignments: ``<prefix>.bam`` with the unitig alignments.
        --regions:    ``<prefix>.bed.gz`` with tab-separated
                      chromosome/start/end lines.
        --calls:      ``<prefix>.vcf.gz`` with the calls to filter.
        --parameters: ``<prefix>.json`` with the filter parameters.

    The VCF header (extended with the ``Confidence`` INFO field) and every
    retained, annotated call are printed to stdout.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--alignments', type=str, help='')
    parser.add_argument('--regions', type=str, help='')
    parser.add_argument('--calls', type=str, help='')
    parser.add_argument('--parameters', type=str, help='')
    args = parser.parse_args()

    import json
    # Use a context manager so the parameter file handle is not leaked.
    with open('{}.json'.format(args.parameters)) as parameters_file:
        parameters = json.load(parameters_file)

    vcf = VCF(args.calls + '.vcf.gz')
    vcf.add_info_to_header({
        'ID': 'Confidence',
        'Description': 'Measure of confidence in call based upon unitig structure',
        'Type': 'String',
        'Number': '1'
    })

    try:
        with pysam.AlignmentFile(args.alignments + '.bam', 'rb') as unitigs, \
                gzip.open(args.regions + '.bed.gz', 'rt') as regions:
            print(vcf.raw_header, end='')
            for bed_line in regions:
                chromosome, start, end = bed_line.strip().split('\t')
                region = '{}:{}-{}'.format(chromosome, start, end)
                for variant in vcf(region):
                    retain_call, call_confidence = retainCall_reportConfidence(
                        unitigs, variant, region, parameters)
                    if retain_call:
                        print(annotate(variant, call_confidence), end='')
    finally:
        # Close the VCF even when a region or a call fails to process.
        vcf.close()
def find_SVs():
    """Print the calls of one SV type that reach the configured minimum size.

    Command-line arguments:
        --calls:      path of the VCF, or ``stdin`` to read ``/dev/stdin``.
        --svtype:     ``DEL`` or ``INS``; anything else exits with status 1.
        --parameters: prefix of ``<prefix>.json``; the threshold is read from
                      ``parameters['filterCalls']['minSVSize']``.

    The VCF header and every matching record are printed to stdout.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--calls', type=str, help='')
    parser.add_argument('--svtype', type=str, help='')
    parser.add_argument('--parameters', type=str, help='')
    args = parser.parse_args()

    import json
    # Use a context manager so the parameter file handle is not leaked.
    with open('{}.json'.format(args.parameters)) as parameters_file:
        parameters = json.load(parameters_file)

    # Validate before opening the VCF so no handle is left open on exit.
    svtype = args.svtype
    if svtype not in ['DEL', 'INS']:
        print('svtype', svtype, 'not permitted!', file=sys.stderr)
        sys.exit(1)

    # Loop-invariant: parse the threshold once instead of once per record.
    min_sv_size = int(parameters['filterCalls']['minSVSize'])

    variants = VCF('/dev/stdin') if args.calls == 'stdin' else VCF(args.calls)
    try:
        print(variants.raw_header, end="")
        for variant in variants:
            # Decomposition may cause vcf records with genotype "0/0" to appear.
            # These should be removed because truvari flags these as FPs,
            # artificially inflating the FP rate
            if hom_ref(variant):
                continue
            if (get_svtype(variant) == svtype
                    and abs(get_sv_length(variant)) >= min_sv_size):
                print(variant, end='')
    finally:
        variants.close()
def write_pass_vcf(annotated_vcf):
    """Write the PASS records of ``annotated_vcf`` to a bgzipped, indexed VCF.

    The output name is derived by replacing the ``.annotated.vcf.gz`` suffix
    with ``.annotated.pass.vcf``; the file is then compressed with ``bgzip``
    and indexed with ``tabix``.  A record is kept when its ``FILTER`` is
    ``None`` or the string ``'None'``.

    Args:
        annotated_vcf: path of the annotated, bgzipped input VCF.
    """
    import shlex

    out_vcf = re.sub(r'\.annotated\.vcf\.gz$', '.annotated.pass.vcf',
                     annotated_vcf)
    vcf = VCF(annotated_vcf)
    w = Writer(out_vcf, vcf)
    num_rejected = 0
    num_pass = 0
    for rec in vcf:
        if rec.FILTER is None or rec.FILTER == 'None':
            w.write_record(rec)
            num_pass += 1
        else:
            num_rejected += 1
    vcf.close()
    w.close()

    logger.info('Number of non-PASS/REJECTED variant calls: ' +
                str(num_rejected))
    logger.info('Number of PASSed variant calls: ' + str(num_pass))

    # Paths are quoted so names with spaces or shell metacharacters are safe.
    quoted_in = shlex.quote(str(annotated_vcf))
    quoted_out = shlex.quote(str(out_vcf))
    if num_pass == 0:
        logger.warning(
            'There are zero variants with a \'PASS\' filter in the VCF file')
        # Produce a header-only VCF.  The original command lacked the pipe
        # between bgzip and egrep, so egrep and its pattern were handed to
        # bgzip as file names and the header was never extracted.
        os.system('bgzip -dc ' + quoted_in + ' | egrep \'^#\' > ' + quoted_out)
    os.system('bgzip -f ' + quoted_out)
    os.system('tabix -f -p vcf ' + shlex.quote(str(out_vcf) + '.gz'))
    return
def main():
    """Annotate long indels with ``SVLEN`` and ``SVTYPE`` INFO fields.

    A record is annotated when the length of its longest ALT allele differs
    from the REF length by more than 49 bases.  ``SVLEN`` is
    ``longest ALT - REF`` and ``SVTYPE`` is ``INS`` for a positive length,
    ``DEL`` otherwise.  Input and output paths come from ``get_args()``
    (``args.vcf`` and ``args.output``).
    """
    args = get_args()
    vcf_in = VCF(args.vcf)
    vcf_in.add_info_to_header({
        'ID': 'SVLEN',
        'Description': 'length of sv',
        'Type': 'Integer',
        'Number': '1'
    })
    vcf_in.add_info_to_header({
        'ID': 'SVTYPE',
        'Description': 'type of sv - just DEL or INS based on SVLEN',
        'Type': 'String',
        'Number': '1'
    })
    vcf_out = Writer(args.output, vcf_in)
    for v in vcf_in:
        # Records without any ALT allele have no length difference to
        # measure; max() over an empty ALT list used to raise ValueError.
        if v.ALT:
            sv_length = max(len(alt) for alt in v.ALT) - len(v.REF)
            if abs(sv_length) > 49:
                v.INFO["SVLEN"] = sv_length
                v.INFO["SVTYPE"] = "INS" if sv_length > 0 else "DEL"
        # NOTE(review): every record is written, annotated or not - confirm
        # this matches the original indentation of the write call.
        vcf_out.write_record(v)
    vcf_in.close()
    vcf_out.close()
def compute_min_SV_size():
    """Report the smallest call of one SV type.

    Command-line arguments:
        --calls:  path of the VCF, or ``stdin`` to read ``/dev/stdin``.
        --svtype: ``DEL`` or ``INS``; anything else exits with status 1.

    Only calls whose absolute length is below 10000 are considered; when
    none qualifies the variant is reported as ``None`` and the size as
    ``10000``.
    """
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--calls', type=str, help='')
    parser.add_argument('--svtype', type=str, help='')
    args = parser.parse_args()

    # Validate before opening the VCF so no handle is left open on exit.
    svtype = args.svtype
    if svtype not in ['DEL', 'INS']:
        print('svtype', svtype, 'not permitted!', file=sys.stderr)
        sys.exit(1)

    variants = VCF('/dev/stdin') if args.calls == 'stdin' else VCF(args.calls)
    min_size_variant = None
    min_size = 10000
    try:
        for variant in variants:
            # Decomposition may cause vcf records with genotype "0/0" to
            # appear. These should be removed
            if hom_ref(variant):
                continue
            if get_svtype(variant) != svtype:
                continue
            # Track the absolute length.  Storing the signed length made
            # min_size negative after the first negative-length call, so no
            # later call could ever compare smaller.
            size = abs(get_sv_length(variant))
            if size < min_size:
                min_size_variant = variant
                min_size = size
    finally:
        variants.close()

    print('min-size variant: {}'.format(str(min_size_variant)), end='')
    print('min size: {}'.format(min_size))
    print()
def extract_pharmcat_pgx_regions(tabix_executable_path, input_vcf, output_dir,
                                 input_ref_pgx_vcf):
    '''
    Extract the PGx regions listed in input_ref_pgx_vcf from input_vcf.

    The regions are built per chromosome from the positions in the reference
    PGx VCF (aggregated with get_vcf_pos_min_max).  Matching variants are
    written, with a leading 'chr' added to numeric chromosome names, to
    <output_dir>/<input prefix>.pgx_regions.vcf.gz, which is then indexed
    through tabix_index_vcf().

    Returns the path of the written file.
    '''
    print(
        'Modify chromosome names.\nExtract PGx regions based on the input reference PGx position file.'
    )
    output_name = obtain_vcf_file_prefix(input_vcf) + '.pgx_regions.vcf.gz'
    path_output = os.path.join(output_dir, output_name)

    source = VCF(input_vcf)
    reference = VCF(input_ref_pgx_vcf)

    # One row per chromosome with the aggregated reference positions.
    positions = allel.vcf_to_dataframe(input_ref_pgx_vcf)
    stripped_chroms = positions['CHROM'].replace({'chr': ''}, regex=True)
    positions['CHROM'] = stripped_chroms.astype(str).astype(int)
    per_chromosome = positions.groupby(
        ['CHROM'])['POS'].agg(get_vcf_pos_min_max).reset_index()

    def to_region(row):
        # "<chromosome>:<aggregated positions>"
        return ':'.join(row.values.astype(str))

    # Decide which chromosome naming the input VCF uses.
    chr_prefix = re.compile("^chr")
    source_has_chr = any(chr_prefix.match(name) for name in source.seqnames)

    if not source_has_chr:
        # The output uses 'chr' names, so declare them in the header.
        for name in source.seqnames:
            source.add_to_header('##contig=<ID=chr' + name + '>')

    regions = per_chromosome.apply(to_region, axis=1)
    if source_has_chr:
        # Query with 'chr'-prefixed region strings.
        regions = regions.replace({'^': 'chr'}, regex=True)

    # The header is taken from the (possibly extended) input VCF.
    writer = Writer(path_output, source, mode="wz")
    for region in regions:
        for record in source(region):
            record.CHROM = re.sub(r'^([0-9]+)', r'chr\1', record.CHROM)
            writer.write_record(record)

    source.close()
    reference.close()
    writer.close()

    tabix_index_vcf(tabix_executable_path, path_output)
    return path_output
def main(invcf: str = typer.Argument(..., help="输入的vcf文件"),
         outvcf: str = typer.Argument(..., help="输出的vcf文件"),
         mindepth: int = typer.Option(10, help="最低reads覆盖率"),
         het_altrange: Tuple[float, float] = typer.Option(
             (0.2, 0.8), help="杂合位点的alt频率范围"),
         homref_maxaltrate: float = typer.Option(
             0, help="纯合ref型GT,最大alt reads比例不超过这个"),
         homalt_minaltrate: float = typer.Option(
             1, help="纯合alt型GT,最小alt reads比例不低于这个")):
    """
    Mask (set to missing) every genotype that meets any of the following:

    Heterozygous calls whose alt-read frequency is outside the 20% to 80% range.

    Homozygous calls whose supporting-read fraction is not 100%.

    Calls covered by fewer than 10 reads.

    (The percentages and depth above are the option defaults.)
    """
    vcf = VCF(invcf)
    w = Writer(outvcf, vcf)
    for v in vcf:
        indicies_mask = filter_samples(v, mindepth, het_altrange,
                                       homref_maxaltrate, homalt_minaltrate)
        if indicies_mask:
            # Edit one local list and assign it back once, so the masking
            # never depends on v.genotypes handing out the same list object
            # on every access.
            genotypes = v.genotypes
            for index in indicies_mask:
                # All alleles missing (-1), unphased.
                genotypes[index] = [-1] * v.ploidy + [False]
            v.genotypes = genotypes
        w.write_record(v)
    w.close()
    vcf.close()
def open_vcf(path: PathType) -> Iterator[VCF]:
    """Yield a VCF reader opened on *path* and close it afterwards.

    The reader is closed when the generator is resumed or finalised, also if
    the consumer raises.  Presumably wrapped by ``contextlib.contextmanager``
    outside this view so it can be used in a ``with`` statement.
    """
    reader = VCF(path)
    try:
        yield reader
    finally:
        reader.close()
def augment_vcf(vcf_in_file, vcf_out_file, bed_files, decimals):
    """Copy a VCF, adding the fields derived from ``bed_files`` to each record.

    The header of the input is extended by ``modify_header`` before the
    writer is created, so the output header declares the new fields.
    Records are only augmented when the input VCF contains samples;
    otherwise they are copied unchanged.
    """
    reader = modify_header(VCF(vcf_in_file), bed_files)
    writer = Writer(vcf_out_file, reader)
    bed = parse_bed_files(bed_files)

    # The sample list does not change while iterating, so check it once.
    has_samples = len(reader.samples) > 0

    for record in reader:
        if has_samples:
            record = add_fields_to_variant(record, bed, decimals)
        writer.write_record(record)

    reader.close()
    writer.close()
def output_pharmcat_ready_vcf(input_vcf, output_dir, output_prefix):
    '''
    Write one single-sample VCF per sample of input_vcf.

    The sample named 'PharmCAT' is excluded (it must be present in the
    input, otherwise list.remove raises ValueError).  Each remaining sample
    is written to <output_dir>/<output_prefix>.<sample>.vcf.
    '''
    probe = VCF(input_vcf)
    sample_names = probe.samples
    sample_names.remove('PharmCAT')
    probe.close()

    for sample_name in sample_names:
        print('Generating a PharmCAT-ready VCF for ' + sample_name)

        # Re-open the input restricted to the current sample.
        reader = VCF(input_vcf, samples=sample_name)
        target = os.path.join(
            output_dir, output_prefix + '.' + sample_name + '.vcf')
        writer = Writer(target, reader, mode='w')
        for record in reader:
            writer.write_record(record)

        writer.close()
        reader.close()
def seperate_vcffile(self):
    """Split the VCF files found under ``self.from_directory`` per sample.

    For a processed sample the records are written to
    ``<self.target_directory>/<sample>/<chromosome>-<sample>[<n>].vcf``,
    which is then gzip-compressed and removed; ``<n>`` is appended when the
    plain ``.vcf`` name already exists.  Records are skipped when the
    sample's genotype is reference (see the per-record test below).

    NOTE(review): the source this was reconstructed from had lost its
    indentation.  The final ``break`` is laid out here as the last statement
    of the per-sample loop (so only the first sample of each file is
    written) - confirm against the original file.
    """
    # start = time.time()
    file_list = self.search_vcf_file(self.from_directory)
    for file in file_list:
        vcf_read = VCF(file)
        samples = vcf_read.samples
        # The chromosome name is taken from the first record of the file.
        chromosome_num = ""
        for variant in vcf_read:
            chromosome_num = variant.CHROM
            break
        for sample in samples:
            start = time.time()
            # print(sample, "file write start... ", start)
            # Make sure the target directory and the per-sample directory exist.
            try:
                if not (os.path.isdir(self.target_directory)):
                    os.makedirs(os.path.join(self.target_directory))
                if not (os.path.isdir(self.target_directory + "/" + sample)):
                    os.makedirs(
                        os.path.join(self.target_directory + "/" + sample))
            except OSError as e:
                print("Failed to create directory!!!!!", e)
                raise
            filepath = os.path.join(self.target_directory + "/" + sample,
                                    chromosome_num + "-" + sample + ".vcf")
            # Append a running number until the file name is unused.
            # (Only the uncompressed name is checked; the file that remains
            # after this method is the ".gz" one.)
            index = 0
            while os.path.exists(filepath):
                index = index + 1
                filepath = os.path.join(
                    self.target_directory + "/" + sample,
                    chromosome_num + "-" + sample + str(index) + ".vcf")
            # Re-open the file restricted to this sample, so genotypes[0]
            # below is the genotype of this sample.
            out_read_vcf = VCF(file, samples=[sample])
            write_file = Writer(filepath, out_read_vcf)
            for variant in out_read_vcf:
                if chromosome_num == "Y":
                    # On "Y" only the first allele is inspected.
                    if not variant.genotypes[0][0] == 0:
                        write_file.write_record(variant)
                # Elsewhere, skip records whose first two alleles are both 0.
                elif not (variant.genotypes[0][0] == 0
                          and variant.genotypes[0][1] == 0):
                    write_file.write_record(variant)
            write_file.close()
            out_read_vcf.close()
            # Compress the written VCF with gzip and drop the plain copy.
            with open(filepath, "rb") as f_in:
                with gzip.open(filepath + ".gz", "wb") as f_out:
                    shutil.copyfileobj(f_in, f_out)
            os.remove(filepath)
            sec = time.time() - start
            print(sample + " write end...",
                  time.strftime("%H:%M:%S", time.gmtime(sec)))
            break
        vcf_read.close()
def main(
    vcf_path: str,
    loci_info: TextIO,
    outdir: str,
    verbose: bool,
    chrom: str,
    max_indel_len: int,
    loci_dir: str,
):
    """Apply all ALT variants in a VCF to their corresponding loci reference
    sequences. Creates multiple mutants of the reference loci sequence.

    For every locus of the loci info file, the reference record is read from
    ``<loci_dir>/<chrom>/<locus>.fa`` and written to ``<outdir>/<locus>.fa``,
    followed by the records produced by applying each overlapping variant.
    """
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s]: %(message)s",
        level=logging.DEBUG if verbose else logging.INFO,
    )
    loci_dir = Path(loci_dir)
    outdir = Path(outdir)
    if not outdir.exists():
        outdir.mkdir()

    logging.info("Loading loci info file...")
    ivtree = load_loci_info(loci_info)
    logging.info(f"Loaded info for {len(ivtree)} loci")

    vcf = VCF(vcf_path)
    logging.info("Applying variants to loci...")
    for interval in ivtree:
        # Both interval ends are 1-based inclusive.
        start, end = interval.begin, interval.end
        loci_name = interval.data
        loci_path = loci_dir / chrom / f"{loci_name}.fa"
        loci_record = get_record_for_loci(loci_path)

        n_variants = 0
        with (outdir / loci_path.name).open("w") as outstream:
            write_record(loci_record, outstream)

            # Collect all mutants first; they are written only after every
            # variant of the locus has been applied.
            mutants = []
            for variant in vcf(f"{chrom}:{start}-{end}"):
                n_variants += 1
                mutants += loci_record.apply_variant(
                    variant, relative_start=start, max_indel_len=max_indel_len)
            for mutant in mutants:
                write_record(mutant, outstream)

        if n_variants >= 1:
            logging.debug(
                f"{n_variants} record(s) associated with loci {loci_name}")
        else:
            logging.info(f"No records associated with loci {loci_name}")

    vcf.close()
    logging.info("All done.")
def test_missing_samples():
    """Requested samples that are absent from the VCF are dropped.

    The sample selection is given once as a list and once as a
    comma-separated string; in both cases only the one existing sample must
    remain.
    """
    sample_selections = (
        ['101976-101976', 'sample_not_in_vcf'],
        '101976-101976,sample_not_in_vcf',
    )
    for samples in sample_selections:
        vcf = VCF(VCF_PATH, gts012=True, samples=samples)
        try:
            assert len(vcf.samples) == 1
        finally:
            # Close every reader, including the second one (which was left
            # open before) and also when the assertion fails.
            vcf.close()
def main():
    """Copy the records whose ``SVLEN`` INFO value is greater than 49.

    Input and output paths come from ``get_args()`` (``args.vcf`` and
    ``args.output``).

    NOTE(review): the comparison is signed, so records with a negative SVLEN
    never pass - confirm that is intended.
    """
    args = get_args()
    reader = VCF(args.vcf)
    writer = Writer(args.output, reader)

    long_enough = (record for record in reader if record.INFO["SVLEN"] > 49)
    for record in long_enough:
        writer.write_record(record)

    reader.close()
    writer.close()
def test_fd():
    """A VCF can be opened from an integer file descriptor.

    Reads the first record of ``decomposed.vcf`` through the descriptor and
    checks the genotype types of its five samples.
    """
    fh = open(os.path.join(HERE, "decomposed.vcf"))
    # Pass the raw descriptor, not the path or the file object.
    fn = fh.fileno()
    vcf = VCF(fn)
    v = next(vcf)
    assert np.all(v.gt_types == np.array([vcf.HOM_REF, vcf.HOM_REF, vcf.HET, vcf.HET, vcf.UNKNOWN]))
    # NOTE(review): the file object is closed before the VCF that shares its
    # descriptor; keep this order unless the descriptor ownership is checked.
    fh.close()
    vcf.close()
def compute_sv_lengths():
    """Print the absolute SV length of every call, one per line.

    Command-line arguments:
        --calls: path of the VCF, or ``stdin`` to read ``/dev/stdin``.
    """
    cli = argparse.ArgumentParser(description='')
    cli.add_argument('--calls', type=str, help='')
    options = cli.parse_args()

    source = '/dev/stdin' if options.calls == 'stdin' else options.calls
    calls = VCF(source)
    for call in calls:
        length = get_sv_length(call)
        print(abs(length))
    calls.close()
def print_vcf(sample):
    """Echo the VCF on stdin to stdout with a replaced column header.

    All non-empty header lines except the ``#CHROM`` line are printed
    unchanged; the ``#CHROM`` line is replaced by one whose single sample
    column is ``sample``.  The records follow unchanged.
    """
    reader = VCF('/dev/stdin')

    meta_lines = [
        line for line in reader.raw_header.split('\n')
        if line and not line.startswith('#CHROM')
    ]
    for line in meta_lines:
        print(line)

    column_header = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}'
    print(column_header.format(sample))

    for record in reader:
        print(record, end='')
    reader.close()
def iter_vcf(input_file, output_file, proc_rec, proc_hdr=None, postproc_hdr=None, **kwargs):
    """Stream a VCF through a per-record callback.

    :param input_file: path to input VCF file
    :param output_file: path to output VCF file (can be .vcf or .vcf.gz, but
        the result is always bgzipped/tabixed and written with the .vcf.gz
        extension); ``None`` writes to stdout without compression
    :param proc_rec: a function called as ``proc_rec(rec, vcf, **kwargs)``
        for every record. Returns either a (new) record to write, or None to
        indicate that the record should be discarded. If ``proc_rec`` is
        falsy, no records are written at all.
    :param proc_hdr: a function called once with the cyvcf2 reader (e.g. to
        add values to the header with vcf.add_info_to_header)
    :param postproc_hdr: a function to postprocess the finalized header
        string (vcf.raw_header), e.g. in order to remove values
    :param kwargs: any parameters to pass directly into proc_rec
    """
    from cyvcf2 import VCF

    reader = VCF(input_file, gts012=True)
    if proc_hdr is not None:
        proc_hdr(reader)

    to_file = output_file is not None
    if to_file:
        out_ungz, out_gz = get_ungz_gz(output_file)
        sink = open(out_ungz, 'w')
    else:
        sink = sys.stdout

    header_text = reader.raw_header
    if postproc_hdr is not None:
        header_text = postproc_hdr(header_text)
    sink.write(header_text)

    for rec in reader:
        if not proc_rec:
            continue
        processed = proc_rec(rec, reader, **kwargs)
        if processed is None:
            continue
        # Records are written through their string form.
        sink.write(f'{processed}')

    sys.stderr.write(f'Finished writing {output_file}\n')
    reader.close()

    if to_file:
        sink.close()
        run_simple(f'bgzip -f {out_ungz} && tabix -f -p vcf {out_gz}')
        sys.stderr.write(f'Compressed {output_file}\n')
def compute_sv_coordinates():
    """Print chromosome, start and end of every call as tab-separated lines.

    Start and end are obtained from ``coordinates(variant)``.

    Command-line arguments:
        --calls: path of the VCF, or ``stdin`` to read ``/dev/stdin``.
    """
    cli = argparse.ArgumentParser(description='')
    cli.add_argument('--calls', type=str, help='')
    options = cli.parse_args()

    source = '/dev/stdin' if options.calls == 'stdin' else options.calls
    calls = VCF(source)
    for call in calls:
        chromosome = call.CHROM
        start, end = coordinates(call)
        row = '{}\t{}\t{}\n'.format(chromosome, start, end)
        print(row, end='')
    calls.close()
def main():
    """Replace the ALT of malformed deletion records by ``<DEL>``.

    A record is rewritten when its first ALT allele equals its REF and its
    ``SVTYPE`` INFO value is ``DEL``.  All records are written to the output
    and the number of rewritten records is printed.  Paths come from
    ``get_args()`` (``args.vcf`` and ``args.output``).
    """
    args = get_args()
    reader = VCF(args.vcf)
    writer = Writer(args.output, reader)

    n_fixed = 0
    for record in reader:
        alt_equals_ref = record.REF == record.ALT[0]
        if alt_equals_ref and record.INFO["SVTYPE"] == "DEL":
            record.ALT = "<DEL>"
            n_fixed += 1
        writer.write_record(record)

    print("Fixed {} positions".format(n_fixed))
    writer.close()
    reader.close()
def main():
    """Make the REF allele of every record agree with the genome.

    The expected allele is looked up with
    ``get_reference_nucleotide(CHROM, start, genome)``; records whose REF
    differs are corrected.  All records are written to the output and the
    number of corrected records is printed.  Paths come from ``get_args()``
    (``args.genome``, ``args.vcf`` and ``args.output``).
    """
    args = get_args()
    genome = Fasta(args.genome)
    reader = VCF(args.vcf)
    writer = Writer(args.output, reader)

    n_fixed = 0
    for record in reader:
        expected = get_reference_nucleotide(record.CHROM, record.start, genome)
        if record.REF != expected:
            record.REF = expected
            n_fixed += 1
        writer.write_record(record)

    print("Fixed {} positions".format(n_fixed))
    writer.close()
    reader.close()
def main():
    """Set the FILTER column of a VCF from per-locus statistics.

    The statistics table (``--statsfile``, tab-separated) gets a ``FILTER``
    column computed by ``GetFilters`` and is saved as ``<out>.tab``.  Each
    VCF record is then matched to a table row through its ``START`` INFO
    value and the table's ``start`` column; records without a matching row
    are marked ``MissingInfo``.  The annotated VCF is written to stdout.
    """
    parser = argparse.ArgumentParser(__doc__)
    parser.add_argument("--vcf", help="VCF file", type=str, required=True)
    parser.add_argument("--statsfile",
                        help="File with chrom, start, locus stats",
                        type=str, required=True)
    parser.add_argument("--out", help="Prefix for output files", type=str,
                        required=True)
    parser.add_argument("--min-hwep", help="Minimum HWE p-value", type=float,
                        default=0)
    parser.add_argument("--min-callrate", help="Minimum call rate",
                        type=float, default=0)
    parser.add_argument("--min-het", help="Minimum heterozygosity",
                        type=float, default=0)
    parser.add_argument("--max-hrun-offset",
                        help="For periods 5+, discard if the ref has "
                             "homopolymer run > period+offset",
                        type=int, default=100000)
    parser.add_argument("--filter-segdup",
                        help="Filter loci overlapping a segdup",
                        action="store_true")
    args = parser.parse_args()

    reader = VCF(args.vcf)
    n_samples = len(reader.samples)

    # Compute the filter string of every locus and save the table.
    sys.stderr.write("Getting filters...\n")
    locstats = pd.read_csv(args.statsfile, sep="\t")
    locstats["FILTER"] = locstats.apply(
        lambda row: GetFilters(row, args, n_samples), axis=1)
    locstats.to_csv(args.out + ".tab", sep="\t", index=False)

    # Locus start -> filter string.
    sys.stderr.write("Getting filter dictionary...\n")
    filter_dict = dict(zip(locstats["start"], locstats["FILTER"]))

    # Declare every filter that may be set, then annotate the records.
    sys.stderr.write("Setting filter field in VCFs...\n")
    adict = {
        "HWE": "HWE less than %s"%args.min_hwep,
        "Callrate": "Callrate less than %s"%args.min_callrate,
        "Het": "Het less than %s"%args.min_het,
        "Hrun": "Hrun greater than %s"%args.max_hrun_offset,
        "Segdup": "Locus in a segmental duplication",
        "MissingInfo": "No stats provided for the locus",
    }
    for filter_id, description in adict.items():
        reader.add_filter_to_header({"ID": filter_id,
                                     "Description": description})

    writer = Writer("/dev/stdout", reader)
    for record in reader:
        filters = filter_dict.get(record.INFO["START"], "MissingInfo")
        # "." in the table means that no filter applies.
        record.FILTER = "PASS" if filters == "." else filters.split(";")
        writer.write_record(record)

    writer.close()
    reader.close()
def parseVCF(invcf,outbasename,reportMultipleSamples,reportNoSamples):
    """Tabulate, per variant, the samples carrying it.

    Writes ``<outbasename>_genotypes.txt`` with one row per record listing
    the heterozygous and the homozygous-alternate samples (only the first
    ALT allele is reported).

    Args:
        invcf: path of the input VCF.
        outbasename: prefix of all output files.
        reportMultipleSamples: when true, also write
            ``<outbasename>_multipleVariantSamples.txt`` with the samples
            that carry more than one variant (or print a message when there
            is none).
        reportNoSamples: when true, also write
            ``<outbasename>_samplesWithNoVariant.txt`` with the samples that
            carry no variant (or print a message when every sample does).
    """
    vcf_data = VCF(invcf,gts012=True)
    samples = vcf_data.samples

    # sample -> list of (CHROM, POS) of the variants it carries
    multiple_samples = defaultdict(list)
    # Samples seen with at least one het or hom-alt genotype.  A set makes
    # the bookkeeping O(1) per carrier instead of list.remove() scans.
    carriers = set()

    header = "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
        "CHROM","POS","REF","ALT","NUM_HET","HET_SAMPLES","NUM_HOM_ALT","HOM_ALT_SAMPLES")
    with open(outbasename + "_genotypes.txt",'w') as out:
        out.write(header)
        for record in vcf_data:
            gt_types = record.gt_types
            # With gts012=True: 1 is heterozygous, 2 is homozygous alternate.
            samples_het = [samples[i] for i in (gt_types==1).nonzero()[0]]
            samples_homvar = [samples[i] for i in (gt_types==2).nonzero()[0]]

            variant_id = (record.CHROM,str(record.POS))
            # Heterozygous carriers first, then homozygous ones, as before.
            for sample in samples_het + samples_homvar:
                multiple_samples[sample].append(variant_id)
                carriers.add(sample)

            out.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                record.CHROM, record.POS, record.REF, record.ALT[0],
                record.num_het, ';'.join(samples_het),
                record.num_hom_alt,';'.join(samples_homvar)))
    vcf_data.close()

    if multiple_samples and reportMultipleSamples:
        outlist = [[k,v] for k, v in multiple_samples.items() if len(v) > 1]
        if not outlist:
            print("{}".format("No samples carry more than one variant represented in the input VCF."))
        else:
            with open(outbasename + "_multipleVariantSamples.txt",'w') as outmult:
                outmult.write("{}\t{}\t{}\n".format("Sample","#variants","variant_id"))
                for sample, variant_ids in outlist:
                    outmult.write("{}\t{}\t{}\n".format(
                        sample,len(variant_ids),
                        ';'.join('_'.join(i) for i in variant_ids)))

    if reportNoSamples:
        # Keep the sample order of the VCF header.
        absent_samples = [s for s in samples if s not in carriers]
        if absent_samples:
            with open(outbasename + "_samplesWithNoVariant.txt","w") as out:
                out.write('\n'.join(absent_samples))
        else:
            print("All samples have a variant in the given VCF. Are you sure the input VCF is a subset from the original? If it's not,"
                  "the '-n' flag doesn't make sense.")
def process_vcf(vcf):
    """Print a VCF to stdout with the SpliceAI annotation split into fields.

    Five INFO fields are declared in the header.  For every record the
    ``SpliceAI`` INFO value (or, when that is empty, ``SpliceAI_ind``) is
    handed to ``set_new_fields``; records with neither are printed
    unchanged.
    """
    reader = VCF(vcf, gts012=True)

    new_fields = (
        ('Gene_SpliceAI', 'Gene for which spliceAI gave the prediction.'),
        ('DS_AG', 'SpliceAI score for an acceptor gain.'),
        ('DS_AL', 'SpliceAI score for an acceptor lost.'),
        ('DS_DG', 'SpliceAI score for a donor gain.'),
        ('DS_DL', 'SpliceAI score for a donor lost.'),
    )
    for field_id, description in new_fields:
        reader.add_info_to_header({
            'ID': field_id,
            'Description': description,
            'Type': 'String',
            'Number': '.'
        })

    print(reader.raw_header.rstrip())
    for record in reader:
        snvs = record.INFO.get('SpliceAI')
        indels = record.INFO.get('SpliceAI_ind')
        # The SNV annotation takes precedence over the indel one.
        prediction = snvs or indels
        if prediction:
            record = set_new_fields(record, prediction)
        print(str(record).rstrip())
    reader.close()
def get_variants(vcf_file, padding, sv_padding, vcf_parse=None):
    """
        Given a vcf file, this function parses through the file and yields
        the variant with all relevant information

        Args:
            vcf_file (string): Path to vcf file
            padding: passed on to every Variant
            sv_padding: passed on to every Variant
            vcf_parse: optional argument for INFOParser; when given, the
                resulting parser is passed on to every Variant

        Yields:
            variant (mutacc.builds.build_variant.Variant): Variant object
    """
    vcf_file = parse_path(vcf_file)
    vcf = VCF(str(vcf_file), "r")
    try:
        samples = vcf.samples

        parser = None
        if vcf_parse:
            parser = INFOParser(vcf_parse, "read")

        for entry in vcf:
            yield Variant(entry, samples, padding, sv_padding, parser=parser)
    finally:
        # Also runs when the consumer stops iterating early or an error is
        # raised; code placed after the loop would be skipped in those cases.
        vcf.close()
def main(
    in_vcf: str,
    out_vcf: str,
    keep_mnps: bool,
    verbose: bool,
):
    """Extract SNPs from a pandora VCF. It keeps records regardless of the
    called allele, provided the record is a SNP. If the ALT allele is a '.'
    and the REF is a single character, it is considered a SNP. It is assumed
    that each record has no more than 1 alternate allele.

    With ``keep_mnps`` the REF may be longer than one base, as long as every
    ALT allele has the same length as the REF.
    """
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s]: %(message)s",
        level=logging.DEBUG if verbose else logging.INFO,
    )

    vcf_reader = VCF(in_vcf)
    vcf_writer = Writer(out_vcf, tmpl=vcf_reader)

    logging.info("Checking for records to keep...")
    for record in vcf_reader:
        ref_len = len(record.REF)
        ref_len_allowed = keep_mnps or ref_len == 1
        # An empty ALT list (ALT is '.') passes this check trivially.
        if ref_len_allowed and all(len(alt) == ref_len for alt in record.ALT):
            vcf_writer.write_record(record)
            continue
        logging.debug(
            f"Discarding record CHROM: {record.CHROM} at POS: {record.POS}"
        )

    vcf_writer.close()
    vcf_reader.close()
    logging.info("Done!")
def main(vcf_path: str, loci_info: TextIO, output: str, verbose: bool, chrom: str):
    """Associate information about loci to the relevant VCF records based on
    position. This script will add three new INFO fields to the VCF. The INFO
    fields are loci_name, start, and end. See the VCF header entries for these
    fields for more information."""
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s]: %(message)s",
        level=logging.DEBUG if verbose else logging.INFO,
    )

    logging.info("Loading loci info file...")
    ivtree = load_loci_info(loci_info)

    vcf = VCF(vcf_path)
    write_new_info_fields(vcf)
    vcf_writer = Writer(output, vcf)

    logging.info("Associating loci info to VCF records...")
    for interval in ivtree:
        start, end, name = interval.begin, interval.end, interval.data

        # Only records overlapping the locus are annotated and written.
        n_records = 0
        for n_records, record in enumerate(vcf(f"{chrom}:{start}-{end}"), start=1):
            record.INFO[LOCI_ID] = name
            record.INFO[START_ID] = start
            record.INFO[END_ID] = end
            vcf_writer.write_record(record)

        if n_records >= 1:
            logging.debug(f"{n_records} record(s) associated with loci {name}")
        else:
            logging.info(f"No records associated with loci {name}")

    vcf_writer.close()
    vcf.close()
    logging.info("All done.")
def add_absent_records(vcf_absent_gnomad, outfile, nind):
    """Append the records of a VCF to ``outfile`` with synthetic genotypes.

    Every record of ``vcf_absent_gnomad`` is appended as one VCF text line
    whose FORMAT is ``GT:AD:DP:GQ:PL`` and whose ``nind`` sample columns all
    carry the same homozygous-reference genotype (``0/0``, AD 100,0,
    DP 100, GQ 50, PL 0,1500,1500).  Only the first ALT allele is written.

    Args:
        vcf_absent_gnomad: path of the VCF with the records to append.
        outfile: path of the text file to append to.
        nind: number of sample columns to emit per record.
    """
    #format_fields = get_format_fields_from_vcf(vcf_absent_gnomad)
    logging.info("Processing variants absent in gnomAD")
    gt, gt_dp, gt_ref_depth, gt_alt_depth, gt_qual = "0/0", 100, 100, 0, 50
    gt_phred_ll_homref, gt_phred_ll_het, gt_phred_ll_homalt = 0, 1500, 1500
    fmt = [
        "{}:{},{}:{}:{}:{},{},{}".format(gt, gt_ref_depth, gt_alt_depth, gt_dp,
                                         gt_qual, gt_phred_ll_homref,
                                         gt_phred_ll_het, gt_phred_ll_homalt)
    ] * nind

    def format_info(key, value):
        # Flags are written as the bare key, multi-valued fields as a
        # comma-separated list - not as Python reprs ("True", "(1, 2)").
        if value is True:
            return key
        if isinstance(value, (tuple, list)):
            return key + "=" + ",".join(str(item) for item in value)
        return key + "=" + str(value)

    vcf_data = VCF(vcf_absent_gnomad, gts012=True)
    try:
        info_fields = [
            field["ID"] for field in vcf_data.header_iter()
            if field["HeaderType"] == "INFO"
        ]
        with open(outfile, 'a') as out:
            for record in vcf_data:
                str_info = []
                for i in info_fields:
                    try:
                        str_info.append(format_info(i, record.INFO[i]))
                    except KeyError:
                        # Field declared in the header but absent here.
                        continue
                columns = [
                    record.CHROM,
                    str(record.POS),
                    record.ID,
                    record.REF,
                    record.ALT[0] if record.ALT else None,
                    # Convert only a present QUAL: str(None) is the text
                    # "None" and would bypass the '.' substitution below.
                    None if record.QUAL is None else str(record.QUAL),
                    # NOTE(review): a FILTER of None is written as '.' -
                    # confirm that PASS records should not be written as PASS.
                    record.FILTER,
                    # An empty INFO column must be '.', not an empty string.
                    ';'.join(str_info) or None,
                    "GT:AD:DP:GQ:PL",
                ]
                write_record = ['.' if v is None else v for v in columns]
                out.write('\t'.join(write_record + fmt) + "\n")
    finally:
        vcf_data.close()