Пример #1
0
def is_empty(vcf):
    """
    Return True if the vcf file contains no variant records, this doesn't use pysam

    Lines starting with '#' (header / meta lines) and blank or whitespace-only
    lines are not counted as variants.

    :param vcf: Path to possibly-gzipped vcf file
    :return: False as soon as the first variant line is seen, True if none is found
    """
    for line in smartfile(vcf):
        # A bare newline is a truthy string, so strip before testing; otherwise a
        # trailing blank line would make a header-only file look non-empty.
        if line.strip() and not line.startswith('#'):
            return False
    return True
Пример #2
0
def batch_variants(vcf, max_batch_size=1000, min_safe_dist=2000):
    """
    Split the variants of a VCF file into batches such that no batch contains two variants
    whose start positions are within min_safe_dist bases of each other, and write each
    batch to its own VCF file in the current working directory.

    Variants are placed greedily, in file order, into the first existing batch that is
    not yet full and that canadd() accepts; a new batch is started when none qualifies.

    :param vcf: Filename of VCF file containing variants to batch
    :param max_batch_size: Maximum number of variants per batch
    :param min_safe_dist: Min permissible distance between two variants in batch
    :return: List of VCF files containing subsets of variants
    """
    # Collect the leading run of '#' lines so that every batch file carries the
    # same header as the input.
    header = []
    for line in smartfile(vcf):
        if not line.startswith('#'):
            break
        header.append(line)

    name = strip_extensions(os.path.basename(vcf), ['gz', 'vcf'])

    # Read all records up front and release the file handle deterministically.
    with pysam.VariantFile(vcf) as variant_file:
        variants = list(variant_file)

    batches = []
    for var in variants:
        for batch in batches:
            # Only batches that still have room are candidates for this variant.
            if len(batch) < max_batch_size and canadd(
                    var, batch, max_batch_size, min_safe_dist=min_safe_dist):
                batch.append(var)
                break
        else:
            # Need to make a new batch for this variant, it doesn't fit anywhere
            batches.append([var])

    files = []
    for i, batch in enumerate(batches, 1):
        # randstr() is appended to the name, presumably to avoid collisions
        # between runs -- its exact output is defined elsewhere.
        batchname = '{}.batch{:03d}.'.format(name, i) + randstr() + ".vcf"
        with open(batchname, 'w') as out:
            out.writelines(header)
            for record in batch:
                out.write(str(record))
        files.append(batchname)
    return files
Пример #3
0
def sort_vcf(vcf, conf):
    """
    Write a sorted copy of the given VCF to a new file and pass it to bgz_tabix.

    Lines starting with '#' are copied to the output in the order they are read;
    all remaining lines are split on tabs, ordered with the var_comp comparison
    function, and written after them.

    :param vcf: Path to the VCF file to sort (read via smartfile)
    :param conf: Configuration object, forwarded unchanged to bgz_tabix
    :return: Whatever bgz_tabix returns for the sorted file
        (presumably the path of the compressed, indexed result -- verify against bgz_tabix)
    """
    import functools

    tmpfile = vcf.replace(".vcf", ".sort" + randstr() + ".vcf").replace(".gz", "")
    records = []

    with open(tmpfile, "w") as ofh:
        for line in smartfile(vcf):
            if line.startswith('#'):
                ofh.write(line)
            else:
                records.append(line.split('\t'))

        # var_comp is an old-style two-argument comparison function; sorted() has
        # no 'cmp' keyword on Python 3, so adapt it with cmp_to_key (which is also
        # available on Python 2.7).
        for record in sorted(records, key=functools.cmp_to_key(var_comp)):
            ofh.write('\t'.join(record))

    return bgz_tabix(tmpfile, conf)
Пример #4
0
def set_genotypes(orig_vcf, newGT, region, conf, compress_result=True):
    """
    Create a new VCF file that is a copy of the given VCF, except that the genotype of
    the first sample column is replaced for records inside 'region'.

    Header lines, records with fewer than 10 columns (no sample data) and records
    outside the region are copied through unchanged. For every other record the first
    ':'-separated item of the first sample column is replaced by 'newGT' (or by '0' when
    the ALT column equals VCF_MISSING) and the remaining items are kept. Only columns
    1-9 plus that rewritten sample column are written, so any additional sample columns
    are dropped from modified records.

    NOTE(review): this assumes GT is the first key of the FORMAT column -- confirm.

    :param orig_vcf: Path to the input VCF (read via smartfile)
    :param newGT: Genotype string to write into the sample column
    :param region: Object with chrom, start and stop attributes restricting which records
        are modified, or None to modify every record
    :param conf: Configuration object, forwarded to bgz_tabix when compressing
    :param compress_result: If True, pass the new file through bgz_tabix and return its result
    :return: Path of the new VCF, or the return value of bgz_tabix if compress_result is True
    :raises GTModException: If a record to be modified has more than one ALT allele
    """
    # FIXME: Update to use pysam.VariantFile
    newvcf = '{}.gtmod.{}.vcf'.format(strip_extensions(orig_vcf, ['gz','vcf']), randstr())

    # 'with' guarantees the output handle is closed even when GTModException
    # (or a parse error) aborts the loop part-way through.
    with open(newvcf, 'w') as ofh:
        for line in smartfile(orig_vcf):
            if line.startswith('#'):
                ofh.write(line)
                continue

            toks = line.split('\t')

            # Records without a sample column are never modified, so pass them
            # through before touching CHROM/POS; this also keeps blank or
            # truncated lines from raising on toks[1].
            if len(toks) < 10:
                ofh.write(line)
                continue

            chrom = toks[0]
            start = int(toks[1])

            # FIXME: Interval intersection logic off?  stop=start+rlen is not taken into consideration.
            if region is not None and (chrom != region.chrom or start < region.start or start >= region.stop):
                ofh.write(line)
                continue

            if ',' in toks[4]:
                raise GTModException('Cant set GT for multi-alt variants.')

            # A record with no ALT allele gets genotype '0' instead of newGT.
            sample_items = ['0'] if toks[4] == VCF_MISSING else [newGT]
            if ':' in toks[9]:
                # Keep every sample sub-field after the first one untouched.
                sample_items.extend(toks[9].strip().split(':')[1:])
            ofh.write('\t'.join(toks[0:9] + [':'.join(sample_items)]) + '\n')

    if compress_result:
        newvcf = bgz_tabix(newvcf, conf)

    return newvcf