def windowpairs_from_vcf(chrom, vcf_file_list, sv_type_list):
    """
    Generate chromosome-wide window pairs from VCF files. The SV types to
    include in the window pairs can be restricted via sv_type_list.

    :param chrom: Chromosome for which window pairs are generated
    :param vcf_file_list: List of paths to VCF files (one per caller)
    :param sv_type_list: List of SV types (DEL, INV, BND, INS, DUP) for which
        windows are generated
    :return: Set of StructuralVariant window pairs
    """
    window_pairs = set()
    for vcf_file in vcf_file_list:
        assert os.path.isfile(vcf_file)
        vcf_in = VariantFile(vcf_file, 'r')
        caller = re.findall(r'^\w*', vcf_file)
        lostSV_logfile = open("Excluded_SVs_" + caller[0] + ".log", 'w')
        lostSV_logfile.write(str(vcf_in.header) + "\n")
        for rec in vcf_in.fetch():
            svrec = SVRecord_generic(rec, caller[0])
            startCI = abs(svrec.cipos[0]) + svrec.cipos[1]
            endCI = abs(svrec.ciend[0]) + svrec.ciend[1]
            # Exclude SVs with confidence intervals wider than 200 bp or with
            # zero length; log them instead.
            if startCI > 200 or endCI > 200 or svrec.start == svrec.end:
                lostSV_logfile.write(str(rec) + "\n")
            elif svrec.chrom == chrom and svrec.svtype in sv_type_list:
                window_pairs.add(
                    StructuralVariant(Breakpoint(svrec.chrom, svrec.start),
                                      Breakpoint(svrec.chrom, svrec.end)))
        vcf_in.close()
        lostSV_logfile.close()
    return window_pairs
def file_process(fname):
    try:
        cpath = fname.rstrip('\n')
        sys.stderr.write("Processing " + cpath + "\n")
        sys.stderr.flush()
        in_vcf = VariantFile(cpath)
        # pdb.set_trace()
        for cat in tbl_dict:
            for key in tbl_dict[cat]:
                # Replace each listed header entry with the definition taken
                # from the exemplar VCF (good_boy).
                getattr(in_vcf.header, cat)[key].remove_header()
                in_vcf.header.add_meta(
                    cat_dict[cat],
                    items=[('ID', key),
                           ('Number', getattr(good_boy.header, cat)[key].number),
                           ('Type', getattr(good_boy.header, cat)[key].type),
                           ('Description', getattr(good_boy.header, cat)[key].description)])
        # pdb.set_trace()
        out_vcf = VariantFile("-", 'w', header=in_vcf.header)
        for rec in in_vcf.fetch():
            out_vcf.write(rec)
        out_vcf.close()
    except Exception as e:
        sys.stderr.write(str(e) + "\n failed to process " + cpath + "\n")
def vcf_file_to_regions(in_file: Union[str, os.PathLike]):
    vcf = VariantFile(in_file, mode="r")  # VariantFile automatically opens the file
    try:
        for variant in vcf:  # type: VariantRecord
            yield BedRegion(variant.contig, variant.start, variant.stop)
    finally:
        # Make sure vcf is always closed, even if the generator is abandoned
        vcf.close()
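# Minimal usage sketch for the generator above; "calls.vcf.gz" is a
# placeholder path.
for region in vcf_file_to_regions("calls.vcf.gz"):
    print(region)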
class VcfAugmenter(ABC):

    def __init__(self, in_path, command_line, out_file=sys.stdout):
        """
        in_path -- Path to input VCF, used as template.
        command_line -- A string that will be added as a VCF header entry
            (use None to not add this to the VCF header).
        out_file -- Open file-like object to which VCF is written.
        """
        # TODO This is slow because it reads in the entire VCF one extra time
        contigs, formats, infos = missing_headers(in_path)
        # We repair the header (adding missing contigs, formats, infos) of the
        # *input* VCF because we will modify the records that we read, and
        # these are associated with the input file.
        self._reader = VariantFile(in_path)
        augment_header(self._reader.header, contigs, formats, infos)
        if command_line is not None:
            command_line = '"' + command_line.replace('"', "") + '"'
            self._reader.header.add_meta("commandline", command_line)
        self.setup_header(self._reader.header)
        self._writer = VariantFile(out_file, mode="w", header=self._reader.header)
        self._unprocessed_record = None
        self._reader_iter = iter(self._reader)

    @abstractmethod
    def setup_header(self, header):
        pass

    def close(self):
        self._writer.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    @property
    def samples(self):
        return list(self._reader.header.samples)

    def _iterrecords(self, chromosome):
        """Yield all records for the target chromosome"""
        n = 0
        if self._unprocessed_record is not None:
            assert self._unprocessed_record.chrom == chromosome
            yield self._unprocessed_record
            n += 1
        for record in self._reader_iter:
            n += 1
            if record.chrom != chromosome:
                # save it for later
                self._unprocessed_record = record
                assert n != 1
                return
            yield record
def to_arrow(vfname, batchparams, cols, nested_props=("FILTER", "FORMAT")):
    """Convert `VariantRecord` batches to Arrow `RecordBatch`es

    The returned Arrow buffer breaks compatibility with a standard VCF column
    header: ALT -> ALTS.  This is because `pysam.VariantRecord` does this, and
    it makes sense.  It also significantly reduces code complexity.

    The keys nested under the INFO column are completely free-form, so they
    are detected automatically from the VCF file header.  During conversion to
    an Arrow buffer, filling these fields presents a significant book-keeping
    challenge (they can also be nested!).  So we opt to fill these
    semi-automatically, and any absent fields are set to NULL (thanks to
    Arrow!).  Note that while converting to other formats, these may need to
    be filled by reasonable alternatives, which might come at a cost.  For
    example, Pandas does not support NULLs, and a likely replacement would be
    numpy.nan.  This means you firstly lose zero-copy conversion, and possibly
    convert the field type to a float!  Beware.

    vfname      -- Variant file name to be opened with `VariantFile`
    batchparams -- Parameters to get VariantRecord batch iterator
    cols        -- Record column spec (as returned by get_vcf_cols(..))

    returns a `RecordBatch`
    """
    batch = []
    vf = VariantFile(vfname, mode="r", threads=4)
    # FIXME:
    for vrec in vf.fetch(*batchparams):
        # break compatibility with VCF file column header: ALT -> ALTS.
        # INFO_* fields are filtered out as they are handled separately later.
        row = OrderedDict((c, getattr(vrec, c.lower()))
                          for c in cols if c in _simple_vcf_cols)
        # vrec.{prop}: [('<filter>', <pysam.libcbcf.VariantMetadata>)]
        row.update((prop, [key for key in getattr(vrec, prop.lower()).keys()])
                   for prop in nested_props)
        # missing INFO_* fields are treated as NULLs (see doc string)
        row.update((f"INFO_{k}", v) for k, v in vrec.info.items())
        # reverse the layout: fmt in sample -> sample in fmt.  this way, for a
        # given FORMAT field, all samples will be in adjacent blocks.
        row.update(
            (f"{fmt}_{sample.name}", (int(sample.phased), *sample.values()[i]))
            for i, fmt in enumerate(row["FORMAT"])
            for sample in vrec.samples.values())
        # NOTE: indexing above slows the generator expr by a factor of two.
        # indexing relies on the fixed ordering of FORMAT field values.
        batch.append(row)
    vf.close()
    # FIXME:
    # from pprint import pprint
    # pprint(batch[-1])
    # populate as struct -> flatten
    batch = pa.array(batch, type=pa.struct(cols)).flatten()
    return pa.RecordBatch.from_arrays(batch, pa.schema(cols))
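# Hedged usage sketch; `get_vcf_cols` is referenced in the docstring above but
# not shown in this excerpt, so its call signature is an assumption.
import pyarrow as pa

cols = get_vcf_cols("sample.vcf.gz")             # assumed helper
rb = to_arrow("sample.vcf.gz", ("chr1",), cols)  # a single RecordBatch
table = pa.Table.from_batches([rb])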
def decompose_multiallelic_record(in_vcf, out_vcf):
    """Break records with multiple ALT alleles into multiple records."""
    i_vcf = VariantFile(in_vcf, "r")
    # str.strip(".gz") strips characters, not a suffix; remove the extension
    # explicitly instead.
    raw_out = out_vcf[:-len(".gz")] if out_vcf.endswith(".gz") else out_vcf
    o_vcf = VariantFile(raw_out, "w", header=i_vcf.header)

    for record in i_vcf:
        # Only MuTect puts multiple ALTs in one record
        number_events = len(record.alts)
        # Temporary fix due to segfault
        # see https://github.com/leukgen/click_mergevcfs/issues/2
        if number_events >= 8:
            continue
        elif number_events > 1:
            click.echo("file={},pos={}".format(in_vcf, record.pos))
            for i in range(0, number_events):
                new_rec = record.copy()
                new_rec.alts = tuple([record.alts[i]])
                # Multiallelic sites have GTs such as 0/1/2, which causes
                # errors later; change them to ./.
                genotypes = list(record.samples)
                for g in genotypes:
                    # Overwrite GT
                    new_rec.samples[g]["GT"] = (None, None)
                    # Use none_if_tuple_out_of_idx because
                    # record.samples[g]['AD'] would sometimes return
                    # a tuple of (None,)
                    if "AD" in list(record.samples[g]):
                        new_rec.samples[g]["AD"] = (
                            record.samples[g]["AD"][0],
                            none_if_tuple_out_of_idx(
                                t=record.samples[g]["AD"], index=i + 1),
                        )
                    if "AF" in list(record.samples[g]):
                        new_rec.samples[g]["AF"] = none_if_tuple_out_of_idx(
                            t=record.samples[g]["AF"], index=i)
                    if "F1R2" in list(record.samples[g]):
                        new_rec.samples[g]["F1R2"] = (
                            record.samples[g]["F1R2"][0],
                            none_if_tuple_out_of_idx(
                                t=record.samples[g]["F1R2"], index=i + 1),
                        )
                    if "F2R1" in list(record.samples[g]):
                        new_rec.samples[g]["F2R1"] = (
                            record.samples[g]["F2R1"][0],
                            none_if_tuple_out_of_idx(
                                t=record.samples[g]["F2R1"], index=i + 1),
                        )
                o_vcf.write(new_rec)
        else:
            o_vcf.write(record)

    o_vcf.close()
    subprocess.check_call(["bgzip", "-f", raw_out])
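# `none_if_tuple_out_of_idx` is called above but not shown in this excerpt.
# A minimal sketch inferred from the call sites (return t[index], or None
# when the tuple is too short); the real helper may differ.
def none_if_tuple_out_of_idx(t, index):
    return t[index] if t is not None and index < len(t) else None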
def filter_somatic(in_vcf_path, out_vcf_path):
    in_vcf = VariantFile(in_vcf_path)
    out_vcf = VariantFile(out_vcf_path, 'w', header=in_vcf.header)
    num_skipped_records = 0
    for rec in in_vcf:
        if is_somatic(rec):
            try:
                out_vcf.write(rec)
            except OSError:
                num_skipped_records += 1
    print("Skipped " + str(num_skipped_records) + " bad records")
    in_vcf.close()
    out_vcf.close()
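# `is_somatic` is used above but not defined in this excerpt. A minimal,
# hypothetical sketch assuming the upstream caller marks somatic records with
# a SOMATIC INFO flag; the real predicate may use another criterion (e.g. a
# FILTER value or a sample-level field).
def is_somatic(rec):
    return rec.info.get("SOMATIC", False)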
def filter_bcf_file(self, bcf_file):
    bcf_in = VariantFile(bcf_file, 'rb')
    bcf_out = VariantFile("%s.target.vcf" % bcf_file[:-4], 'w',
                          header=bcf_in.header)
    for rec in bcf_in.fetch():
        if rec.contig == self.contig_id:
            if self.contig_start == False and self.contig_end == False:
                pass
            else:
                if self.contig_start <= rec.pos <= self.contig_end:
                    bcf_out.write(rec)
    bcf_in.close()
    bcf_out.close()
def main():
    vcf = VariantFile(snakemake.input.vcf)
    outlier_table = pd.read_table(snakemake.input.outliers)
    filtered = VariantFile(snakemake.output[0], mode='w', header=vcf.header)

    outliers = defaultdict(list)
    for idx, row in outlier_table.iterrows():
        outliers[row['svtype']].append(row['sample'])

    for record in remove_outliers(vcf, outliers):
        filtered.write(record)

    filtered.close()
def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--vcf', help='Input vcf', required=True)
    parser.add_argument(
        '--dict', help='Tab-delimited sample id conversion table',
        required=True)
    args = parser.parse_args()

    vcf = VariantFile(args.vcf)
    id_dict = get_id_dictionary(args.dict)
    new_ids = get_new_ids(vcf, id_dict)
    print_ids(new_ids)
    vcf.close()
def main():
    vcf_path = sys.argv[1]

    # First pass: collect all contigs seen in the records.
    vcf = VariantFile(vcf_path, 'r')
    contigs = set()
    for record in vcf:
        contigs.add(record.chrom)
    vcf.close()

    # Second pass: add the contigs to the header and emit the full VCF.
    vcf = VariantFile(vcf_path, 'r')
    for contig in sorted(contigs):
        vcf.header.add_line("##contig=<ID={}>".format(contig))
    print(vcf.header, end="")
    for record in vcf:
        print(record, end="")
def prepare_octopus_vcf_for_rtg(octopus_vcf, tumour_sample, out_vcf_name):
    """Octopus reports non-diploid genotypes for somatic variants."""
    in_vcf = VariantFile(octopus_vcf)
    out_vcf = VariantFile(out_vcf_name, 'w', header=in_vcf.header)
    n_failed = 0
    for record in in_vcf:
        old_gt = record.samples[tumour_sample]['GT']
        assert len(old_gt) > 1
        # Pick the last non-reference allele and force a diploid genotype.
        somatic_allele = next(a for a in reversed(list(old_gt))
                              if a is not None and a > 0)
        record.samples[tumour_sample]['GT'] = (old_gt[0], somatic_allele)
        try:
            out_vcf.write(record)
        except OSError:
            n_failed += 1
    out_vcf.close()
    index(out_vcf_name)
def dtoxog_maf_to_vcf(input_maf: str, reference_fa: str, output_vcf: str) -> None:
    """
    Transforms a dToxoG MAF into a minimal VCF of only dToxoG failures.

    :param input_maf: The annotated dToxoG MAF output file.
    :param reference_fa: Reference fasta used to make the seqdict header.
    :param output_vcf: The output minimal VCF with only failed dToxoG records.
        A BGzip and tabix index are created if it ends with '.gz'.
    """
    logger = Logger.get_logger("dtoxog_maf_to_vcf")
    logger.info("Transforms dToxoG MAF to minimal VCF of dToxoG failures")

    # setup
    total = 0
    written = 0
    tag = "oxog"

    # header
    header = generate_header(reference_fa, tag)

    # writer
    mode = get_pysam_outmode(output_vcf)
    writer = VariantFile(output_vcf, mode=mode, header=header)

    # process
    try:
        with open(input_maf, "rt") as fh:
            for record in maf_generator(fh):
                total += 1
                if record["oxoGCut"] == "1":
                    new_vcf_record = build_new_record(record, writer, tag)
                    writer.write(new_vcf_record)
                    written += 1
    finally:
        writer.close()

    if mode == "wz":
        logger.info("Creating tabix index...")
        tabix_index(output_vcf, preset="vcf", force=True)

    logger.info("Processed {} records - Wrote {}".format(total, written))
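# `get_pysam_outmode` is used above but not shown. A sketch consistent with
# the `if mode == "wz"` check: pysam's VariantFile writes BGZF-compressed VCF
# with mode 'wz' and plain text with 'w'. The real helper may also handle
# '.bcf' output.
def get_pysam_outmode(output_vcf: str) -> str:
    return "wz" if output_vcf.endswith(".gz") else "w"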
def add_PASSED_field(in_vcf, out_vcf):
    """
    Add PASSED_{caller} fields.

    Add flags (e.g. PASSED_caveman) under INFO for PASS variants, with the aim
    of reducing ambiguity about confident variants in the merged vcf.
    """
    # see logic of merging INFO fields
    # https://github.com/vcftools/vcftools/blob/490848f7865abbb4b436ca09381ea7912a363fe3/src/perl/vcf-merge
    caller = get_caller(in_vcf)
    i_vcf = VariantFile(in_vcf, "rb")
    new_header = i_vcf.header.copy()
    try:
        new_header.info.add(
            "PASSED_{}".format(caller),
            ".",
            "Flag",
            "indicates which caller(s) this variant passed",
        )
        i_vcf.header.info.add(
            "PASSED_{}".format(caller),
            ".",
            "Flag",
            "indicates which caller(s) this variant passed",
        )
    except ValueError:
        pass

    # str.strip(".gz") strips characters, not a suffix; remove the extension
    # explicitly instead.
    raw_out = out_vcf[:-len(".gz")] if out_vcf.endswith(".gz") else out_vcf
    o_vcf = VariantFile(raw_out, "w", header=new_header)

    for record in i_vcf:
        new_rec = record.copy()
        filters = list(record.filter)
        if filters and filters[0] == "PASS":
            new_rec.info["PASSED_{}".format(caller)] = 1
        o_vcf.write(new_rec)

    o_vcf.close()
    subprocess.check_call(["bgzip", "-f", raw_out])
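# `get_caller` is used above but not shown. A hypothetical sketch that infers
# the caller name from the file name (e.g. "caveman.vcf.gz" -> "caveman"); the
# real implementation may instead read it from the VCF header.
import os

def get_caller(vcf_path):
    return os.path.basename(vcf_path).split(".")[0]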
def main(): parser = argparse.ArgumentParser("find_outliers.py") parser.add_argument("input", type=str, help="list of samples names") parser.add_argument("output", type=str, help="list of samples names") parser.add_argument("outliers", type=str, help="list of samples names") args = parser.parse_args() #vcf = VariantFile(snakemake.input.vcf) vcf = VariantFile(args.input) outlier_table = pd.read_table(args.outliers) filtered = VariantFile(args.output, mode='w', header=vcf.header) outliers = defaultdict(list) for idx, row in outlier_table.iterrows(): outliers[row['svtype']].append(row['sample']) for record in remove_outliers(vcf, outliers): filtered.write(record) filtered.close()
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtools standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source', help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-p', '--prefix', help='If provided, variant names '
                        'will be overwritten with this prefix.')
    parser.add_argument('--include-reference-sites', action='store_true',
                        default=False, help='Include records where all '
                        'samples are called 0/0 or ./.')
    parser.add_argument('--standardizer', help='Path to python file with '
                        'custom standardizer definition. (Not yet supported.)')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    template = pkg_resources.resource_filename('svtools',
                                               'data/standard_template.vcf')
    template = VariantFile(template)
    vcf = VariantFile(args.vcf)

    # Template header includes all necessary FILTER, INFO, and FORMAT fields
    # Just need to add samples from VCF being standardized
    header = template.header
    for sample in vcf.header.samples:
        header.add_sample(sample)

    # Tag source in header (note the closing '>' required for a valid line)
    meta = '##FORMAT=<ID={0},Number=1,Type=Integer,Description="Called by {1}">'
    meta = meta.format(args.source, args.source.capitalize())
    header.add_line(meta)
    header.add_line('##source={0}'.format(args.source))

    fout = VariantFile(args.fout, mode='w', header=header)

    standardizer = VCFStandardizer.create(args.source, vcf, fout)
    idx = 1
    for record in standardizer.standardize_vcf():
        if any_called(record) or args.include_reference_sites:
            if args.prefix is not None:
                record.id = '{0}_{1}'.format(args.prefix, idx)
                idx += 1
            fout.write(record)

    # for std_rec in standardize_vcf(vcf, fout):
    #     fout.write(std_rec)

    fout.close()
    vcf.close()
class VcfReader:
    """
    Read a VCF file chromosome by chromosome.
    """

    def __init__(
        self,
        path,
        indels=False,
        phases=False,
        genotype_likelihoods=False,
        ignore_genotypes=False,
        ploidy=None,
    ):
        """
        path -- Path to VCF file
        indels -- Whether to also include insertions and deletions in the
            list of variants.
        ignore_genotypes -- In case of a genotyping algorithm, no genotypes
            may be given in the VCF, so ignore all genotypes
        ploidy -- Ploidy of the samples
        """
        # TODO Always include deletions since they can 'overlap' other variants
        self._indels = indels
        self._vcf_reader = VariantFile(path)
        self._path = path
        self._phases = phases
        self._genotype_likelihoods = genotype_likelihoods
        self._ignore_genotypes = ignore_genotypes
        self.samples = list(self._vcf_reader.header.samples)  # intentionally public
        self.ploidy = ploidy
        logger.debug("Found %d sample(s) in the VCF file.", len(self.samples))

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # follows same structure as for ReadSetReader
        self.close()

    def close(self):
        self._vcf_reader.close()

    @property
    def path(self):
        return self._vcf_reader.filename.decode()

    def _fetch(self, chromosome: str, start=0, end=None):
        try:
            records = self._vcf_reader.fetch(chromosome, start=start, stop=end)
        except ValueError as e:
            if "invalid contig" in e.args[0]:
                raise VcfInvalidChromosome(e.args[0]) from None
            elif "fetch requires an index" in e.args[0]:
                raise VcfIndexMissing(
                    "{} is missing an index (.tbi or .csi)".format(
                        self._path)) from None
            else:
                raise
        return records

    def fetch(self, chromosome: str, start=0, end=None):
        """
        Fetch records from a single chromosome, optionally restricted to a
        single region.

        Return a VariantTable object.
        """
        records = list(self._fetch(chromosome, start=start, end=end))
        return self._process_single_chromosome(chromosome, records)

    def fetch_regions(self, chromosome: str, regions):
        """
        Fetch records from a single chromosome that overlap the given regions.

        :param regions: a list of (start, end) tuples (end can be None)
        """
        records = []
        for start, end in regions:
            records.extend(list(self._fetch(chromosome, start=start, end=end)))
        return self._process_single_chromosome(chromosome, records)

    def __iter__(self):
        """
        Yield VariantTable objects for each chromosome.

        Multi-ALT sites are skipped.
        """
        for chromosome, records in itertools.groupby(
                self._vcf_reader, lambda record: record.chrom):
            yield self._process_single_chromosome(chromosome, records)

    @staticmethod
    def _extract_HP_phase(call):
        hp = call.get("HP")
        if hp is None or hp == (".",):
            return None
        fields = [[int(x) for x in s.split("-")] for s in hp]
        for i in range(len(fields)):
            assert fields[0][0] == fields[i][0]
        block_id = fields[0][0]
        phase = tuple(field[1] - 1 for field in fields)
        return VariantCallPhase(block_id=block_id,
                                phase=phase,
                                quality=call.get("PQ", None))

    @staticmethod
    def _extract_GT_PS_phase(call):
        is_het = not all(x == call["GT"][0] for x in call["GT"])
        if not is_het:
            return None
        if not call.phased:
            return None
        block_id = call.get("PS", 0)
        phase = call["GT"]
        return VariantCallPhase(block_id=block_id,
                                phase=phase,
                                quality=call.get("PQ", None))

    def _process_single_chromosome(self, chromosome, records):
        phase_detected = None
        n_snvs = 0
        n_other = 0
        n_multi = 0
        table = VariantTable(chromosome, self.samples)
        prev_position = None
        for record in records:
            if len(record.alts) > 1:
                # Multi-ALT sites are not supported, yet
                n_multi += 1
                continue

            pos, ref, alt = record.start, str(record.ref), str(record.alts[0])
            if len(ref) == len(alt) == 1:
                n_snvs += 1
            else:
                n_other += 1
                if not self._indels:
                    continue

            if (prev_position is not None) and (prev_position > pos):
                raise VcfNotSortedError(
                    "VCF not ordered: {}:{} appears before {}:{}".format(
                        chromosome, prev_position + 1, chromosome, pos + 1))

            if prev_position == pos:
                logger.warning(
                    "Skipping duplicated position %s on chromosome %r",
                    pos + 1,
                    chromosome,
                )
                continue
            prev_position = pos

            # Read phasing information (allow GT/PS or HP phase information,
            # but not both), if requested
            if self._phases:
                phases = []
                for sample_name, call in record.samples.items():
                    phase = None
                    for extract_phase, phase_name in [
                        (self._extract_HP_phase, "HP"),
                        (self._extract_GT_PS_phase, "GT_PS"),
                    ]:
                        p = extract_phase(call)
                        if p is not None:
                            if phase_detected is None:
                                phase_detected = phase_name
                            elif phase_detected != phase_name:
                                raise MixedPhasingError(
                                    "Mixed phasing information in input VCF "
                                    "(e.g. mixing PS and HP fields)")
                            phase = p
                            # check for ploidy consistency and limits
                            phase_ploidy = len(p.phase)
                            if phase_ploidy > get_max_genotype_ploidy():
                                raise PloidyError(
                                    "Ploidies higher than {} are not supported."
                                    "".format(get_max_genotype_ploidy()))
                            elif p is None or None in p:
                                pass
                            elif self.ploidy is None:
                                self.ploidy = phase_ploidy
                            elif phase_ploidy != self.ploidy:
                                print("phase= {}".format(phase))
                                raise PloidyError(
                                    "Phasing information contains inconsistent "
                                    "ploidy ({} and {})".format(
                                        self.ploidy, phase_ploidy))
                    phases.append(phase)
            else:
                phases = [None] * len(record.samples)

            # Read genotype likelihoods, if requested
            if self._genotype_likelihoods:
                genotype_likelihoods = []
                for call in record.samples.values():
                    GL = call.get("GL", None)
                    PL = call.get("PL", None)
                    # Prefer GLs (floats) over PLs (ints) if both are present
                    if GL is not None:
                        genotype_likelihoods.append(GenotypeLikelihoods(GL))
                    elif PL is not None:
                        genotype_likelihoods.append(
                            GenotypeLikelihoods([pl / -10 for pl in PL]))
                    else:
                        genotype_likelihoods.append(None)
            else:
                genotype_likelihoods = [None] * len(record.samples)

            if not self._ignore_genotypes:
                # check for ploidy consistency and limits
                genotype_lists = [call["GT"] for call in record.samples.values()]
                for geno in genotype_lists:
                    geno_ploidy = len(geno)
                    if geno_ploidy > get_max_genotype_ploidy():
                        raise PloidyError(
                            "Ploidies higher than {} are not supported."
                            "".format(get_max_genotype_ploidy()))
                    elif geno is None or None in geno:
                        pass
                    elif self.ploidy is None:
                        self.ploidy = geno_ploidy
                    elif geno_ploidy != self.ploidy:
                        raise PloidyError(
                            "Inconsistent ploidy ({} and {})".format(
                                self.ploidy, geno_ploidy))
                genotypes = [genotype_code(geno_list)
                             for geno_list in genotype_lists]
            else:
                genotypes = [Genotype([]) for i in range(len(self.samples))]
                phases = [None] * len(self.samples)

            variant = VcfVariant(position=pos,
                                 reference_allele=ref,
                                 alternative_allele=alt)
            table.add_variant(variant, genotypes, phases, genotype_likelihoods)

        logger.debug(
            "Parsed %s SNVs and %s non-SNVs. Also skipped %s multi-ALTs.",
            n_snvs,
            n_other,
            n_multi,
        )

        # TODO remove overlapping variants
        return table
# coding:utf-8
from sys import argv
from os.path import exists
import os
import pysam
import numpy as np
from pysam import VariantFile

script, bam_file, vcf_file, output_bam_file = argv

bamfile = pysam.AlignmentFile(bam_file, "rb")
vcffile = VariantFile(vcf_file)
output_bamfile = pysam.AlignmentFile(output_bam_file, "wb", template=bamfile)

for rec in vcffile.fetch():
    for read in bamfile.fetch():
        # NOTE: rec.pos is 1-based while read.pos (a deprecated alias of
        # read.reference_start) is 0-based, so this comparison mixes
        # coordinate systems; it also rescans the whole BAM per variant.
        if rec.pos == read.pos:
            output_bamfile.write(read)

output_bamfile.close()
bamfile.close()
vcffile.close()
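# If the BAM is indexed (.bai), a region fetch avoids rescanning the whole
# file for every variant. A sketch of the same loop under that assumption;
# rec.start is the 0-based variant position and compares directly with
# read.reference_start.
for rec in vcffile.fetch():
    for read in bamfile.fetch(rec.contig, rec.start, rec.start + 1):
        if read.reference_start == rec.start:
            output_bamfile.write(read)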
    dest='call_vcf',
    help='Called vcf to search for variants not found in reference vcf')
parser.add_argument(
    '-o',
    '--out-vcf',
    action='store',
    dest='out_vcf',
    help='Output vcf that is a subset of called vcf meeting criteria')

args = parser.parse_args()
ref_vcf = VariantFile(args.ref_vcf)
called_vcf = VariantFile(args.call_vcf, threads=4)
out_vcf = VariantFile(args.out_vcf, "w", header=called_vcf.header, threads=4)

x = 0
m = 1000
for record in called_vcf.fetch():
    if x % m == 0:
        sys.stderr.write('Processed ' + str(x) + " records\n")
        sys.stderr.flush()
    f = 0
    for comp in ref_vcf.fetch(record.contig, record.start, record.stop):
        if record.pos == comp.pos and record.alleles == comp.alleles:
            f = 1
            break
    if not f:
        out_vcf.write(record)
    x += 1

out_vcf.close()
ref_vcf.close()
called_vcf.close()
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtools vcfcluster',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('filelist', type=argparse.FileType('r'),
                        help='List of paths to standardized VCFS')
    parser.add_argument('fout', help='Clustered VCF.')
    parser.add_argument('source', help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-r', '--region', default=None,
                        help='Restrict clustering to genomic region.')
    parser.add_argument('-d', '--dist', type=int, default=500,
                        help='Maximum clustering distance. Suggested to use '
                        'max of median + 7*MAD over samples. [500]')
    parser.add_argument('-f', '--frac', type=float, default=0.1,
                        help='Minimum reciprocal overlap between variants. '
                        '[0.1]')
    parser.add_argument('-x', '--blacklist', metavar='BED.GZ',
                        type=TabixFile, default=None,
                        help='Tabix indexed bed of blacklisted regions. Any '
                        'SV with a breakpoint falling inside one of these '
                        'regions is filtered from output.')
    parser.add_argument('-z', '--svsize', type=int, default=500,
                        help='Minimum SV size to report for intrachromosomal '
                        'events. [0]')
    parser.add_argument('-p', '--prefix', default='MERGED',
                        help='Prefix for merged variant IDs. [MERGED]')
    parser.add_argument('-t', '--svtypes', default='DEL,DUP,INV,BND',
                        help='Comma delimited list of svtypes to cluster '
                        '[DEL,DUP,INV,BND]')
    parser.add_argument('--preserve-ids', action='store_true', default=False,
                        help='Include list of IDs of constituent records in '
                        'each cluster.')
    # parser.add_argument('--cluster-bed', type=argparse.FileType('w'),
    #                     help='Bed of constituent calls in each cluster')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    # Parse SV files and lists of samples and sources
    filepaths = [line.strip() for line in args.filelist.readlines()]
    vcfs = parse_filepaths(filepaths)

    svtypes = args.svtypes.split(',')

    svc = VCFCluster(vcfs, dist=args.dist, blacklist=args.blacklist,
                     frac=args.frac, svtypes=svtypes, region=args.region,
                     preserve_ids=args.preserve_ids)

    # Open new file
    if args.fout in '- stdout'.split():
        fout = sys.stdout
    else:
        fout = open(args.fout, 'w')
    fout = VariantFile(fout, mode='w', header=svc.header)

    for i, record in enumerate(svc.cluster()):
        # Name record
        if args.prefix:
            name = [args.prefix]
        else:
            name = ['SV']
        name.append(args.source)
        if args.region:
            chrom = args.region.split(':')[0]
            name.append(chrom)
        name.append(str(i + 1))
        record.id = '_'.join(name)

        # Size filter (CTX have size -1); applied before writing, otherwise
        # the filter has no effect.
        if -1 < record.info['SVLEN'] < args.svsize:
            continue

        fout.write(record)

        # if args.cluster_bed is not None:
        #     flatten_pos(cluster, record.ID, args.cluster_bed)

    fout.close()
        if genotype[0] == genotype[2]:
            if genotype[0] == '0':
                REF_HOMO += 1
            else:
                ALT_HOMO += 1
        else:
            HET += 1
    sys.stdout.write('%s\t%d\t%d\t%d\t%d\n' %
                     ('\t'.join(out), REF_HOMO, HET, ALT_HOMO, MISS))


# Start file reading from here.
infile = VariantFile('-', 'r')
# sys.stdout.write(str(infile.header))
for line in infile:
    ss = str(line).strip().split()
    setoutGenoArrayIndex(ss[8])
    if OUT_FORMAT == 'ALT_FRE':
        outputAlleleFrequency(ss)
    elif OUT_FORMAT == 'GP_GENO':
        outputGPGenotype(ss)
    elif OUT_FORMAT == 'GT_GENO':
        outputGTGenotype(ss)
infile.close()

sys.stdout.flush()
sys.stdout.close()
sys.stderr.flush()
sys.stderr.close()
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtk standardize',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('vcf', help='Raw VCF.')
    parser.add_argument('fout', help='Standardized VCF.')
    parser.add_argument('source', help='Source algorithm. '
                        '[delly,lumpy,manta,wham,melt]')
    parser.add_argument('-p', '--prefix', help='If provided, variant names '
                        'will be overwritten with this prefix.')
    parser.add_argument('--include-reference-sites', action='store_true',
                        default=False, help='Include records where all '
                        'samples are called 0/0 or ./.')
    parser.add_argument('--standardizer', help='Path to python file with '
                        'custom standardizer definition. (Not yet supported.)')
    parser.add_argument('--contigs', type=argparse.FileType('r'),
                        help='Reference fasta index (.fai). If provided, '
                        'contigs in index will be used in VCF header. '
                        'Otherwise all GRCh37 contigs will be used in header. '
                        'Variants on contigs not in provided list will be '
                        'removed.')
    parser.add_argument('--min-size', type=int, default=50,
                        help='Minimum SV size to report [50].')
    parser.add_argument('--call-null-sites', action='store_true',
                        default=False,
                        help='Call sites with null genotypes (./.). Generally '
                        'useful when an algorithm has been run on a single '
                        'sample and has only reported variant sites.')
    parser.add_argument('--sample-names', type=str, default=None,
                        help='Comma-delimited list of sample names to use in '
                        'header [use existing].')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    # Add contigs to header if provided
    if args.contigs:
        template = pkg_resources.resource_filename(
            'svtk', 'data/no_contigs_template.vcf')
        template = VariantFile(template)
        header = template.header
        contig_line = '##contig=<ID={contig},length={length}>'
        for line in args.contigs:
            contig, length = line.split()[:2]
            header.add_line(contig_line.format(**locals()))
    # Use GRCh37 by default
    else:
        template = pkg_resources.resource_filename('svtk',
                                                   'data/GRCh37_template.vcf')
        template = VariantFile(template)
        header = template.header

    vcf = VariantFile(args.vcf)

    # Parse new sample names if provided
    if args.sample_names:
        sample_names_list = args.sample_names.split(',')
    else:
        sample_names_list = vcf.header.samples

    # Tag source in header (note the closing '>' required for a valid line)
    meta = '##FORMAT=<ID={0},Number=1,Type=Integer,Description="Called by {1}">'
    meta = meta.format(args.source, args.source.capitalize())
    header.add_line(meta)
    header.add_line('##source={0}'.format(args.source))

    fout = VariantFile(args.fout, mode='w', header=header)

    standardizer = VCFStandardizer.create(args.source, vcf, fout,
                                          sample_names_list, args.prefix,
                                          args.min_size,
                                          args.include_reference_sites,
                                          args.call_null_sites)
    for record in standardizer.standardize_vcf():
        fout.write(record)

    fout.close()
    vcf.close()
    ######
    # Add annotation to cols and write to file
    ######
    cols.append(str(rsid))
    cols.append(str(allele))
    cols.append(str(gene))
    cols.append(str(annotation))
    cols.append(str(hgvs_c))
    cols.append(str(hgvs_p))

    for info in infos_out:
        if type(info) is tuple or type(info) is list:
            info_str = [str(s) for s in info]
            cols.append(",".join(info_str))
        else:
            cols.append(str(info))

    output_handle.write("\t".join(cols))
    output_handle.write("\n")

######
# Clean up
######
if input_handle is not None:
    input_handle.close()
if output_handle is not None:
    output_handle.close()
if vcf_handle is not None:
    vcf_handle.close()

print("Complete!")
def setoutGenoArrayIndex(oldFormatTags):
    outGenoArrayIndex.clear()
    ss = oldFormatTags.upper().split(':')
    for x in tags:
        try:
            y = ss.index(x)
            outGenoArrayIndex.append(y)
        except ValueError:
            sys.stderr.write('ERROR: cannot find tag "%s" in the input vcf '
                             'FORMAT field.\n' % (x))
            sys.exit(-1)


infile = VariantFile('-', 'r')
sys.stdout.write(str(infile.header))
for line in infile:
    ss = str(line).strip().split()
    out = ss[:vcfMetaCols]
    out[8] = otags  # update genotype tag info in FORMAT
    setoutGenoArrayIndex(ss[8])  # Check format line by line.
    for x in ss[vcfMetaCols:]:
        # if not outGenoArrayIndex:
        #     setoutGenoArrayIndex(ss[8])
        out.append(reformat(x))
    sys.stdout.write('%s\n' % ('\t'.join(out)))
infile.close()

sys.stdout.flush()
sys.stdout.close()
sys.stderr.flush()
sys.stderr.close()
all_coords = set(first_coords + second_coords)
all_coords = sorted(list(all_coords))
# print(all_coords)

# main loop
for site in all_coords:
    match_xlist = [rec for rec in vcf_primary.fetch() if rec.pos == site]
    match_ylist = [rec for rec in vcf_secondary.fetch() if rec.pos == site]

    if len(match_xlist) == 0:
        # no match
        recx_coverage = -1  # any positive number is larger than this
    else:
        assert len(match_xlist) == 1
        recx = match_xlist[0]
        recx_coverage = recx.info["DP"]

    if len(match_ylist) == 0:
        # no match
        recy_coverage = -1  # any positive number is larger than this
    else:
        assert len(match_ylist) == 1
        recy = match_ylist[0]
        recy_coverage = recy.info["DP"]

    if recy_coverage > recx_coverage:
        vcf_out.write(recy)
    else:
        vcf_out.write(recx)

vcf_primary.close()
vcf_secondary.close()
vcf_out.close()
def read_vcf(fh, alleles, slh=None):
    vcf_in = VariantFile(fh)
    sample = list(vcf_in.header.samples)[0]
    availcols = next(vcf_in.fetch()).format.keys()
    vcf_in.seek(0)
    # Check if sample size info is in header
    global_fields = [x for x in vcf_in.header.records if x.key == "SAMPLE"][0]

    if alleles:
        dtype_dict = {'SNP': str, 'Z': float, 'N': float, 'A1': str, 'A2': str}
        usecols = list(dtype_dict.keys())
        # Read in data
        if 'SS' in availcols:
            o = [[rec.id,
                  rec.samples[sample]['ES'][0] / rec.samples[sample]['SE'][0],
                  rec.samples[sample]['SS'][0],
                  rec.alts[0],
                  rec.ref] for rec in vcf_in.fetch()]
            N = pd.Series([x[2] for x in o], dtype='float')
        else:
            o = [[rec.id,
                  rec.samples[sample]['ES'][0] / rec.samples[sample]['SE'][0],
                  rec.alts[0],
                  rec.ref] for rec in vcf_in.fetch()]
            if ('TotalControls' in global_fields.keys()
                    and 'TotalCases' in global_fields.keys()):
                N = pd.Series([float(global_fields['TotalControls']) +
                               float(global_fields['TotalCases'])] * len(o),
                              dtype='float')
            elif 'TotalControls' in global_fields.keys():
                N = pd.Series([float(global_fields['TotalControls'])] * len(o),
                              dtype='float')
            else:
                N = pd.Series([np.NaN] * len(o), dtype='float')
        p = pd.DataFrame({
            'SNP': pd.Series([x[0] for x in o], dtype='str'),
            'Z': pd.Series([x[1] for x in o], dtype='float'),
            'N': N,
            'A1': pd.Series([x[2 + int('SS' in availcols)] for x in o],
                            dtype='str'),
            'A2': pd.Series([x[3 + int('SS' in availcols)] for x in o],
                            dtype='str')
        })
    else:
        dtype_dict = {'SNP': str, 'Z': float, 'N': float}
        usecols = list(dtype_dict.keys())
        if 'SS' in availcols:
            o = [[rec.id,
                  rec.samples[sample]['ES'][0] / rec.samples[sample]['SE'][0],
                  rec.samples[sample]['SS'][0]] for rec in vcf_in.fetch()]
            N = pd.Series([x[2] for x in o], dtype='float')
        else:
            o = [[rec.id,
                  rec.samples[sample]['ES'][0] / rec.samples[sample]['SE'][0]]
                 for rec in vcf_in.fetch()]
            if ('TotalControls' in global_fields.keys()
                    and 'TotalCases' in global_fields.keys()):
                N = pd.Series([float(global_fields['TotalControls']) +
                               float(global_fields['TotalCases'])] * len(o),
                              dtype='float')
            elif 'TotalControls' in global_fields.keys():
                N = pd.Series([float(global_fields['TotalControls'])] * len(o),
                              dtype='float')
            else:
                N = pd.Series([np.NaN] * len(o), dtype='float')
        p = pd.DataFrame({
            'SNP': pd.Series([x[0] for x in o], dtype='str'),
            'Z': pd.Series([x[1] for x in o], dtype='float'),
            'N': N
        })
    vcf_in.close()

    if slh is not None:
        compression = get_compression(slh)
        sl = []
        if compression == "gzip":
            try:
                with gzip.open(slh) as f:
                    for line in f:
                        sl.append(line.strip())
            except (AttributeError, ValueError) as e:
                raise ValueError('Improperly formatted snplist file: ' +
                                 str(e.args))
        else:
            try:
                with open(slh) as f:
                    for line in f:
                        sl.append(line.strip())
            except (AttributeError, ValueError) as e:
                raise ValueError('Improperly formatted snplist file: ' +
                                 str(e.args))
        p = p.loc[p['SNP'].isin(sl)]
    return p
def write_vcf(self, path):
    vcf = VariantFile(path, 'w', header=self.header)
    for variant in self.filtered_variants:
        vcf.write(variant.pysam_rec)
    vcf.close()
def main():
    args = process_input()
    chrom_vcf = args.chrom_vcf
    min_r2 = args.min_r2
    min_maf = args.min_maf
    out_prefix = args.out_prefix
    r2_field_name = args.r2_field_name
    maf_field_name = args.maf_field_name
    new_ids = args.new_ids

    ####
    # Read new ids into a dictionary
    ####
    new_ids_dict = dict()
    if new_ids is not None:
        with open(new_ids, "r") as f:
            for line in f:
                old_id, new_id = line.rstrip().split("\t")
                new_ids_dict[old_id] = new_id
    print("{0} ids to remap".format(len(new_ids_dict)))

    out_vcf_list = "{0}.vcf_list.tsv".format(out_prefix)
    out_vcf_list_handle = open(out_vcf_list, "w")

    for chrom, vcf in chrom_vcf.items():
        chrom_match = re.match("(chr)?(.+)", chrom)
        if chrom_match is not None:
            chrom = chrom_match.group(2)
        else:
            raise ValueError(
                "Chromosome name {0} not formatted correctly!".format(chrom))

        out_vcf_name = "{0}.chr{1}.vcf".format(out_prefix, chrom)
        out_vcf_name_gz = "{0}.chr{1}.vcf.gz".format(out_prefix, chrom)
        out_vcf_name_gz_tbi = "{0}.chr{1}.vcf.gz.tbi".format(out_prefix, chrom)

        print("Processing chr{0} {1}...".format(chrom, vcf))

        in_vcf_handle = VariantFile(vcf)
        pass_filter = in_vcf_handle.header.filters["PASS"]

        out_vcf_list_handle.write("{0}\t{1}".format(chrom, out_vcf_name_gz))
        out_vcf_list_handle.write("\n")

        ####
        # It appears that writing to a BCF is the only method that works in
        # this version of pysam
        ####
        # 'wb' for BCF
        # out_vcf_handle = VariantFile(out_vcf_name, 'wb', header=in_vcf_handle.header)
        # out_vcf_handle = pysam.libcbgzf.BGZFile(out_vcf_name, "wb")
        # out_vcf_handle.write(str(in_vcf_handle.header))
        # cmd = "bgzip -c > {0}".format(out_vcf_name)
        # print(cmd)
        out_vcf_handle = open(out_vcf_name, "w")

        print("Relabeling and writing header...")
        relabeled_ids = 0
        old_header_lines = str(in_vcf_handle.header).split("\n")
        for line in old_header_lines:
            if line == "":
                continue
            if re.match("^#CHROM.+", line):
                cols = line.split("\t")
                for i in range(9, len(cols)):
                    if cols[i] in new_ids_dict:
                        relabeled_ids += 1
                        cols[i] = new_ids_dict[cols[i]]
                # merge new columns
                new_line = "\t".join(cols)
                out_vcf_handle.write(new_line)
            else:
                out_vcf_handle.write(line)
            # write new line
            out_vcf_handle.write("\n")
        print("Relabeled {0} ids".format(relabeled_ids))

        rec_count = 0
        for rec in in_vcf_handle:
            rec_count += 1
            if rec_count % 50000 == 0:
                print("Line: {0:d} {1}:{2:d}".format(rec_count, rec.chrom,
                                                     rec.pos))
            r2 = rec.info[r2_field_name]
            maf = rec.info[maf_field_name]
            if r2 > min_r2 and maf > min_maf:
                # clear filters
                rec.filter.clear()
                # set filter to be pass
                rec.filter.add("PASS")
                # new lines are already there
                out_vcf_handle.write(str(rec))

        # print "Running bgzip on "
        # execute bgzip
        # bgz_handle = Popen(["bgzip", out_vcf_name])
        # bgz_handle.wait()
        in_vcf_handle.close()
        out_vcf_handle.close()

        print("Writing tabix index for {0}...".format(out_vcf_name))
        # seems to only compress files
        pysam.tabix_index(out_vcf_name, preset="vcf")
        if not os.path.isfile(out_vcf_name_gz_tbi):
            pysam.tabix_index(out_vcf_name_gz, preset="vcf")
        if os.path.isfile(out_vcf_name):
            os.remove(out_vcf_name)

    out_vcf_list_handle.close()
    print("Finished writing {0}".format(out_vcf_list))
    print("Complete!")
            windowsizes += [startCI, endCI]
            windowsizes_by_caller[caller[0]]["CI_sizes"]["All"] += [startCI, endCI]
            windowsizes_by_caller[caller[0]]["CI_sizes"]["Start"][svrec.svtype] += [startCI]
            windowsizes_by_caller[caller[0]]["CI_sizes"]["End"][svrec.svtype] += [endCI]
            windowsizes_by_SVType[svrec.svtype] += [startCI, endCI]
            if startCI > 200 or endCI > 200:
                lost_SVs += 1
                windowsizes_by_caller[caller[0]]["Lost_SVs"] += 1
                SVCount_bytype[svrec.svtype + "_lost"] += 1
            elif svrec.svtype == "DEL":
                print(svrec.chrom + ":" + str(svrec.start) + "-" + str(svrec.end))
        vcf_in.close()

fraction_lostSVs_allcallers = round(lost_SVs / total_SVs, 4)

bins = list(range(0, max(windowsizes), 50))
plt.hist(windowsizes,
         bins=bins,
         log=True,
         edgecolor='black',
         linewidth=0.5,
         zorder=3,
         color="seagreen")
plt.xlabel('Breakpoint interval sizes [bp]')
plt.ylabel('Counts [log]')
plt.title(
def main(argv):
    parser = argparse.ArgumentParser(
        description=__doc__,
        prog='svtk vcfcluster',
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('filelist', type=argparse.FileType('r'),
                        help='List of paths to standardized VCFS')
    parser.add_argument('fout', help='Clustered VCF.')
    parser.add_argument('-r', '--region', default=None,
                        help='Restrict clustering to genomic region.')
    parser.add_argument('-d', '--dist', type=int, default=500,
                        help='Maximum clustering distance. Suggested to use '
                        'max of median + 7*MAD over samples. [500]')
    parser.add_argument('-f', '--frac', type=float, default=0.1,
                        help='Minimum reciprocal overlap between variants. '
                        '[0.1]')
    parser.add_argument('-x', '--blacklist', metavar='BED.GZ',
                        type=TabixFile, default=None,
                        help='Tabix indexed bed of blacklisted regions. Any '
                        'SV with a breakpoint falling inside one of these '
                        'regions is filtered from output.')
    parser.add_argument('-z', '--svsize', type=int, default=500,
                        help='Minimum SV size to report for intrachromosomal '
                        'events. [0]')
    parser.add_argument('-p', '--prefix', default='MERGED',
                        help='Prefix for merged variant IDs. [MERGED]')
    parser.add_argument('-t', '--svtypes', default='DEL,DUP,INV,BND',
                        help='Comma delimited list of svtypes to cluster '
                        '[DEL,DUP,INV,BND]')
    parser.add_argument('--ignore-svtypes', action='store_true', default=False,
                        help='Ignore svtypes when clustering.')
    parser.add_argument('-o', '--sample-overlap', type=float, default=0.0,
                        help='Minimum sample overlap for two variants to be '
                        'clustered together.')
    parser.add_argument('--preserve-ids', action='store_true', default=False,
                        help='Include list of IDs of constituent records in '
                        'each cluster.')
    parser.add_argument('--preserve-genotypes', action='store_true',
                        default=False,
                        help='In a set of clustered variants, report best '
                        '(highest GQ) non-reference genotype when available.')
    parser.add_argument('--preserve-header', action='store_true',
                        default=False,
                        help='Use header from clustering VCFs')
    parser.add_argument('--skip-merge', action='store_true', default=False,
                        help='Do not merge clustered records. Adds CLUSTER '
                        'info fields.')
    parser.add_argument('--merge-only', action='store_true', default=False,
                        help='When run on a vcf generated with --skip-merge, '
                        'only merges records with identical CLUSTER fields.')
    parser.add_argument('--single-end', action='store_true', default=False,
                        help='Require only one end to be within the minimum '
                        'distance.')
    # parser.add_argument('--cluster-bed', type=argparse.FileType('w'),
    #                     help='Bed of constituent calls in each cluster')

    # Print help if no arguments specified
    if len(argv) == 0:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args(argv)

    if args.skip_merge and args.merge_only:
        raise ValueError('Cannot use both --skip-merge and --merge-only')

    # Parse SV files and lists of samples and sources
    filepaths = [line.strip() for line in args.filelist.readlines()]
    vcfs = parse_filepaths(filepaths)

    svtypes = args.svtypes.split(',')
    match_svtypes = not args.ignore_svtypes

    do_merge = not args.skip_merge
    do_cluster = not args.merge_only

    svc = VCFCluster(vcfs, dist=args.dist, blacklist=args.blacklist,
                     frac=args.frac, svtypes=svtypes, region=args.region,
                     match_svtypes=match_svtypes,
                     preserve_ids=args.preserve_ids,
                     preserve_genotypes=args.preserve_genotypes,
                     sample_overlap=args.sample_overlap,
                     preserve_header=args.preserve_header,
                     do_cluster=do_cluster, do_merge=do_merge,
                     single_end=args.single_end)

    # Open new file
    if args.fout in '- stdout'.split():
        fout = sys.stdout
    else:
        fout = open(args.fout, 'w')
    fout = VariantFile(fout, mode='w', header=svc.header)

    for i, cluster in enumerate(svc.cluster()):
        if args.prefix:
            cluster_id = [args.prefix]
        else:
            cluster_id = ['SV']
        if args.region:
            chrom = args.region.split(':')[0]
            cluster_id.append(chrom)
        if do_merge and do_cluster:
            cluster_index = i
        else:
            cluster_index = cluster[0].info['CLUSTER']
        cluster_id.append(str(cluster_index + 1))
        cluster_id = '_'.join(cluster_id)

        for record in cluster:
            # Name record
            if do_merge:
                name = cluster_id
            else:
                name = record.id
            record.id = name

            # Size filter (CTX have size -1); applied before writing,
            # otherwise the filter has no effect.
            if -1 < record.info['SVLEN'] < args.svsize:
                continue

            fout.write(record)

            # if args.cluster_bed is not None:
            #     flatten_pos(cluster, record.ID, args.cluster_bed)

    fout.close()
        out_vcf.close()
    except Exception as e:
        sys.stderr.write(str(e) + "\n failed to process " + cpath + "\n")


if len(sys.argv) == 1:
    parser.print_help()
    sys.exit(1)
args = parser.parse_args()
# pdb.set_trace()

tbl_dict = {}
cat_dict = {'info': 'INFO', 'formats': 'FORMAT'}
for line in open(args.table):
    (cat, key) = line.rstrip('\n').split('\t')
    if cat not in tbl_dict:
        tbl_dict[cat] = []
    tbl_dict[cat].append(key)

good_boy = VariantFile(args.ex_vcf)
file_process(args.in_vcf)
# with open(args.in_vcf) as f:
#     vcf_list = f.read().splitlines()
# if len(vcf_list[-1]) < 5:
#     vcf_list.pop()
# with concurrent.futures.ThreadPoolExecutor(32) as executor:
#     results = {executor.submit(mt_file_process, fpath): fpath for fpath in vcf_list}
good_boy.close()