def is_empty(vcf):
    """
    Return True if the vcf file contains no variant records, this doesn't use pysam

    Lines starting with '#' (header / comments) and whitespace-only lines are
    not counted as variants.

    :param vcf: Path to possibly-gzipped vcf file
    :return: False as soon as a variant line is encountered, True otherwise
    """
    for line in smartfile(vcf):
        # A blank line (e.g. a stray trailing newline) is not a variant record
        if line.strip() and not line.startswith('#'):
            return False
    return True
def batch_variants(vcf, max_batch_size=1000, min_safe_dist=2000):
    """
    Split the variants of a VCF file into batches and write each batch to its own VCF file.

    Each variant is appended to the first existing batch that holds fewer than
    max_batch_size variants and for which canadd() accepts it; if no such batch
    exists a new batch is started. The intent is that no batch contains two variants
    whose start positions are within min_safe_dist bases of each other (the actual
    check is delegated to canadd()).

    Every output file receives the leading '#' header lines of the input, followed by the
    variants of the batch. Files are named '<name>.batchNNN.<randstr>.vcf', where <name> is
    the input basename with its 'gz'/'vcf' extensions stripped, so they are created relative
    to the current working directory.

    :param vcf: Filename of VCF file containing variants to batch
    :param max_batch_size: Maximum number of variants per batch
    :param min_safe_dist: Min permissible distance between two variants in batch
    :return: List of VCF files containing subsets of variants
    """
    # Collect the contiguous block of header lines at the top of the file
    header = []
    for line in smartfile(vcf):
        if not line.startswith('#'):
            break
        header.append(line)

    name = strip_extensions(os.path.basename(vcf), ['gz', 'vcf'])

    variants = list(pysam.VariantFile(vcf))
    batches = []
    for var in variants:
        for batch in batches:
            # Only batches that still have room are candidates
            if len(batch) < max_batch_size and canadd(var, batch, max_batch_size, min_safe_dist=min_safe_dist):
                batch.append(var)
                break
        else:
            # Need to make a new batch for this variant, it doesn't fit anywhere
            batches.append([var])

    files = []
    for i, batch in enumerate(batches, 1):
        batchname = '{}.batch{:03d}.'.format(name, i) + randstr() + ".vcf"
        with open(batchname, 'w') as out:
            out.writelines(header)
            out.writelines(str(rec) for rec in batch)
        files.append(batchname)
    return files
def sort_vcf(vcf, conf):
    """
    Write a sorted copy of the given VCF and return the result of bgz_tabix() on it.

    Lines starting with '#' are copied to the output in their original order; all other
    lines are split on tabs, ordered using the var_comp comparison function, and
    written after the header lines.

    :param vcf: Path to the VCF file to sort
    :param conf: Configuration object passed through to bgz_tabix()
    :return: Whatever bgz_tabix(tmpfile, conf) returns for the sorted file
    """
    # Local import keeps this block self-contained; cmp_to_key adapts the old-style
    # comparison function, since sorted() no longer accepts cmp= on Python 3.
    from functools import cmp_to_key

    tmpfile = vcf.replace(".vcf", ".sort" + randstr() + ".vcf").replace(".gz", "")
    records = []
    with open(tmpfile, "w") as ofh:
        for line in smartfile(vcf):
            if line.startswith('#'):
                ofh.write(line)
            else:
                records.append(line.split('\t'))

        for rec in sorted(records, key=cmp_to_key(var_comp)):
            ofh.write('\t'.join(rec))

    return bgz_tabix(tmpfile, conf)
def set_genotypes(orig_vcf, newGT, region, conf, compress_result=True):
    """
    Create a new VCF file that is identical to the given VCF, except that the GT
    field of the first sample column is set to 'newGT' for variants inside 'region'.

    Lines are copied unchanged when they are header lines, when the variant lies outside
    'region' (only if region is not None), or when they have fewer than 10 tab-separated
    columns. For rewritten lines the GT is 'newGT', or '0' when the ALT column equals
    VCF_MISSING; the remaining ':'-separated sample sub-fields are kept, and only the
    first 9 columns plus the rewritten sample column are emitted.

    :param orig_vcf: Path to the input VCF file
    :param newGT: Genotype string to write into the GT field
    :param region: Object with chrom / start / stop attributes restricting which variants
                   are modified, or None to modify all variants
    :param conf: Configuration object passed through to bgz_tabix()
    :param compress_result: If True, pass the new file through bgz_tabix() and return its result
    :return: Path of the new VCF, or the result of bgz_tabix() when compress_result is True
    :raises GTModException: If a variant to be modified has multiple ALT alleles
    """
    # FIXME: Update to use pysam.VariantFile
    newvcf = '{}.gtmod.{}.vcf'.format(strip_extensions(orig_vcf, ['gz', 'vcf']), randstr())

    # 'with' guarantees the output handle is closed even when GTModException is raised
    with open(newvcf, 'w') as ofh:
        for line in smartfile(orig_vcf):
            if line.startswith('#'):
                ofh.write(line)
                continue

            toks = line.split('\t')
            chrom = toks[0]
            start = int(toks[1])

            # FIXME: Interval intersection logic off? stop=start+rlen is not taken into consideration.
            outside_region = region is not None and (
                chrom != region.chrom or start < region.start or start >= region.stop
            )
            # Variants outside the region, or without a sample column, pass through untouched
            if outside_region or len(toks) < 10:
                ofh.write(line)
                continue

            if ',' in toks[4]:
                raise GTModException('Cant set GT for multi-alt variants.')

            # With no ALT allele, '0' is written instead of the requested genotype
            sample_fields = ['0'] if toks[4] == VCF_MISSING else [newGT]
            if ':' in toks[9]:
                # Keep every sample sub-field after the original GT
                sample_fields.extend(toks[9].strip().split(':')[1:])

            ofh.write('\t'.join(toks[0:9] + [':'.join(sample_fields)]) + '\n')

    if compress_result:
        newvcf = bgz_tabix(newvcf, conf)
    return newvcf