Example #1
import os
import random

from cyvcf2 import VCF, Writer


def generate_sample_vcf(filename='/Users/simonelongo/too_big_for_icloud/gnomad.genomes.r2.1.1.sites.vcf.bgz'):
    """Take random samples from each chromosome of a large VCF file to build a smaller VCF for testing."""
    vcf = VCF(filename)
    write = Writer('samp_build38.vcf', vcf)
    write.write_header()
    # REF_GENOME is expected to yield (chromosome name, chromosome length) pairs
    for chrom_num, chrom_len in REF_GENOME:
        begin = random.randint(1000, chrom_len - 1000)
        # append_variants_to_vcf returns a shell command that appends the region's records
        os.system(append_variants_to_vcf(chrom_num, begin, begin + 1000))
    write.close()
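
This snippet relies on two names defined elsewhere in its project. A minimal sketch of what they might look like (hypothetical stand-ins, not the project's actual code; lengths and file names are placeholders):

# Hypothetical stand-ins for the helpers the snippet assumes
REF_GENOME = [('chr1', 248956422), ('chr2', 242193529)]  # (name, length) pairs, truncated

def append_variants_to_vcf(chrom, start, stop):
    # Return a tabix command that appends the region's records to the sample VCF
    return f'tabix gnomad.genomes.r2.1.1.sites.vcf.bgz {chrom}:{start}-{stop} >> samp_build38.vcf'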
Example #2
import os
import random
from collections import defaultdict

from cyvcf2 import VCF, Writer


def generate_sample_vcf(vcf_path, outfile='samp_build38.vcf'):
    """Take random samples from each chromosome of a large VCF file to build a smaller VCF for testing."""
    vcf = VCF(vcf_path)
    write = Writer(outfile, vcf)
    write.write_header()
    # Keep only autosomes, mapping each chromosome name to its length
    key_values = zip(vcf.seqnames, vcf.seqlens)
    chrom_keys = defaultdict(int)
    chroms = get_autosome_names_grch38()
    for name, length in key_values:
        if name in chroms:
            chrom_keys[name] = length
    for chrom_num, chrom_len in chrom_keys.items():
        begin = random.randint(100000, chrom_len - 100000)
        # append_variants_to_vcf returns a shell command that appends the region's records
        os.system(
            append_variants_to_vcf(vcf_path,
                                   chrom_num,
                                   begin,
                                   begin + 10_000,
                                   outfile=outfile))
    write.close()
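
The helper get_autosome_names_grch38 is not shown in this example; a plausible stand-in, followed by an example call (the input path is a placeholder):

# Hypothetical stand-in: GRCh38 autosomes are named 'chr1' through 'chr22'
def get_autosome_names_grch38():
    return {f'chr{i}' for i in range(1, 23)}

# Example invocation
generate_sample_vcf('gnomad.genomes.r2.1.1.sites.vcf.bgz', outfile='samp_build38.vcf')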
Example #3
from os import system
import random
from pyfaidx import Fasta
from cyvcf2 import VCF, Writer


def gen_com(chrom, start, stop):
    """Build a tabix command that appends the given region's variants to samp.vcf."""
    return ('tabix /Users/simonelongo/too_big_for_icloud/gnomad.genomes.r2.1.1.sites.vcf.bgz '
            f'{chrom}:{start}-{stop} >> samp.vcf')


fa = Fasta('/Users/simonelongo/too_big_for_icloud/REFERENCE_GENOME_GRch37.fa')
vcf = VCF('/Users/simonelongo/too_big_for_icloud/gnomad.genomes.r2.1.1.sites.vcf.bgz')
write = Writer('samp.vcf', vcf)
write.write_header()
keys = list(fa.keys())

# Sample a random 1 kb window from each of the first 25 contigs, skipping the mitochondrion
for key in keys[0:25]:
    if key != 'MT':
        begin = random.randint(1000, len(fa[key]) - 1000)
        system(gen_com(key, begin, begin + 1000))

write.close()
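
Because the header is written by cyvcf2's Writer while the records are appended by tabix, it is worth reading the result back as a sanity check. A minimal sketch:

# Print the first few sampled records to confirm the file parses
for i, record in enumerate(VCF('samp.vcf')):
    print(record.CHROM, record.POS, record.REF, record.ALT)
    if i >= 4:
        break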
Example #4
import os
import sys

from cyvcf2 import VCF, Writer
from tqdm import tqdm


def parse_records(vcf, gq, out_format, vcf_type, verbose_level, purity_filter,
                  single_sample, out):
    '''
    Iterates through the VCF and writes records passing the
    above filters to file.

    Parameters
    ----------
    vcf : str
        File path to input VCF
    gq : int
        GQ threshold (records >gq kept)
    out_format : str
        [table|vcf] - whether to write a new VCF or
        a tab-separated file
    vcf_type : str
        [combined|pairs] whether the VCF contains just the 0/5 pairs
        or other samples as well
    verbose_level : int
        How much to print to console.
        If 0 - print no progress info besides the tqdm bar
        If 1 - print counter at each chromosome completion
        If 2 - print all candidate information as found
    purity_filter : bool
        if enabled, will filter out records where the mutated site
        has >2 reads of the non-mut allele (see check_record)
    single_sample : str
        if provided, will assume there is only one sample of interest
        in the VCF (e.g. just one of 0 and 5) - also assumes 'combined' mode
    out : str
        Name of file to write to

    Returns
    -------
    None
        Writes to specified file.
    '''
    print(f'the verbose level is {verbose_level}')
    vcf_in = VCF(vcf)
    sample_names = vcf_in.samples
    pair_sample_names = sorted([
        item for item in sample_names
        if item.endswith('_0') or item.endswith('_5')
    ])
    if single_sample:
        if len(pair_sample_names) != 1:
            print(f'[saltMA] ERROR: samples bonked - {pair_sample_names}')
            print('[saltMA] Exiting...')
            sys.exit()

    # get samples
    try:
        if not single_sample:
            sample_lookup = sample_names.index(pair_sample_names[0]), \
                sample_names.index(pair_sample_names[1])
        else:
            shortlist = [
                s for s in sample_names
                if single_sample in s and s != pair_sample_names[0]
            ]
            sample_lookup = sample_names.index(shortlist[0]), \
                sample_names.index(pair_sample_names[0])
            print(
                f'[saltMA] selected samples are {pair_sample_names[0]} and {shortlist[0]}'
            )
    except IndexError as e:
        print('[saltMA] ERROR: Samples seem incorrect. '
              'Ensure you have a 0 and 5 sample in the VCF, '
              'unless --single_sample has been selected.')
        print('[saltMA] Exiting...')
        sys.exit()

    print(f'[saltMA] initiating filtering for {os.path.basename(vcf)}...')
    counter = 0
    total_count = 0
    doublemut_count = 0
    prev_chr = None

    if out_format == 'vcf':
        outfile = Writer(out, vcf_in)
        outfile.write_header()
    elif out_format == 'table':
        f = open(out, 'w')
        header_string = '\t'.join([
            'fname', 'chrom', 'pos', 'ref', 'alt', 'gt_bases', 'gt_quals',
            'gt_depths'
        ])
        f.write(header_string + '\n')
        basename = os.path.basename(vcf).replace('.vcf.gz', '')

    for record in tqdm(vcf_in):
        total_count += 1
        # check_record (defined elsewhere in the module) applies the GQ/purity/sample filters
        check = check_record(record,
                             vcf_type,
                             sample_lookup,
                             gq=gq,
                             purity_filter=purity_filter,
                             single_sample=single_sample)
        if check == 'doublemut':
            if verbose_level == 2:
                tqdm.write(
                    f'[saltMA] doublemut at {record.CHROM}:{record.POS}')
            doublemut_count += 1
            continue
        elif check:
            counter += 1
            if verbose_level == 1:
                if not prev_chr:
                    prev_chr = record.CHROM
                    tqdm.write(
                        f'[saltMA] first chrom with detected mut is {prev_chr}'
                    )
                elif prev_chr != record.CHROM:
                    tqdm.write(f'[saltMA] {prev_chr} completed.')
                    tqdm.write(f'[saltMA] current count is {counter}')
                    prev_chr = record.CHROM
            elif verbose_level == 2:
                tqdm.write(
                    f'[saltMA] candidate mut found at {record.__repr__()}')
                tqdm.write(f'[saltMA] current count is {counter}')
                tqdm.write(f'[saltMA] doublemut count is {doublemut_count}')
            if out_format == 'vcf':
                outfile.write_record(record)
            elif out_format == 'table':
                out_string = '\t'.join([
                    basename, record.CHROM,
                    str(record.POS), record.REF,
                    str(record.ALT),
                    str(list(record.gt_bases)),
                    str(list(record.gt_quals)),
                    str(list(record.gt_depths))
                ]) + '\n'
                f.write(out_string)

    if out_format == 'table':
        f.close()

    print(f'[saltMA] completed search for {os.path.basename(vcf)}')
    print(f'[saltMA] found {counter} matches over {total_count} sites.')
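
A call in pair (0/5) mode might look like the following; the argument values are illustrative, not taken from the original project:

# Illustrative invocation; file names and thresholds are placeholders
parse_records('sample.vcf.gz', gq=30, out_format='table', vcf_type='combined',
              verbose_level=1, purity_filter=True, single_sample=None,
              out='candidates.tsv')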
Example #5
from cyvcf2 import VCF, Writer
from tqdm import tqdm


def vcfprep(args):
    """
    Iterate through parental VCF and remove records by given filters.

    All records where the parent1 allele is the same as parent2
    ('uninformative sites') are automatically removed.

    Raises ValueError if >2 samples in VCF (should just be the 2 parents)

    Parameters
    ----------
    args : Namespace
        Namespace containing all user-given arguments compiled by arg_parse()

    Returns
    -------
    total_count : int
        total records considered
    kept_count : int
        number of records that passed filters
    """
    vcf_in = VCF(args.vcf)

    if len(vcf_in.samples) > 2:
        raise ValueError('more than 2 parental samples in input VCF')
    if args.snps_only and args.indels_only:
        raise ValueError('both --snps_only and --indels_only provided. pick one or neither!')

    # strip a trailing .gz if present - the Writer writes plain VCF
    if args.out.endswith('.gz'):
        outfile = args.out.replace('.gz', '')
    else:
        outfile = args.out
    vcf_out = Writer(outfile, vcf_in)
    vcf_out.write_header()

    total_count, kept_count = 0, 0

    for record in tqdm(vcf_in):
        total_count += 1

        # skip records with no ALT allele
        if not record.ALT:
            continue

        # SNP filter
        if args.snps_only and not record.is_snp:
            continue

        # indel filter
        if args.indels_only and not record.is_indel:
            continue

        # heterozygote call filter
        if args.no_hets and record.num_het != 0:
            continue

        # genotype quality filter
        if not all(record.gt_quals >= args.min_GQ):
            continue

        # ensure parental alleles differ
        if record.gt_bases[0] == record.gt_bases[1]:
            continue

        # only passes if record not caught in above filters
        vcf_out.write_record(record)
        kept_count += 1

    return total_count, kept_count
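
Since vcfprep takes a Namespace, it can be exercised without the CLI by building one by hand. A minimal sketch, with attribute names inferred from the function body and placeholder values:

from argparse import Namespace

# Attributes mirror the flags the function reads; values are placeholders
args = Namespace(vcf='parents.vcf.gz', out='prepped.vcf',
                 snps_only=True, indels_only=False,
                 no_hets=True, min_GQ=30)
total, kept = vcfprep(args)
print(f'{kept}/{total} records kept')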
Example #6
import os
import time

import pandas as pd
from cyvcf2 import VCF, Writer

# prm (run parameters), allfqc and alltls are project-local modules;
# exact import paths may differ in the original repository
import prm
import allfqc
import alltls


def process_file(data: VCF, groups: list, f: int, fileout: list) -> None:
    #TODO: clean/refactor execution comments like processed file name
    #TODO: refactor processing list of files into single-file processing + remove MSS param
    # data: VCF, groups: list, simul: str, fileout: str
    """
    Computes and rewrites genotypes of all individuals for all samples from the input files
    :param data: cyvcf2 reader object pointing at a VCF file
    :param groups: sample identifiers split into pools
    :param f: integer, index of the file to process in the list
    :param fileout: VCF files with simulated pooled or randomly missing genotypes
    """
    print('Simulation type: ', 'simul')
    print('file out: ', os.path.join(os.getcwd(), fileout[f]))  # prm.PATH_OUT[simul]
    if prm.GTGL == 'GL' and prm.unknown_gl == 'adaptative':
        dic_header = {'ID': 'GL',
                      'Number': 'G',
                      'Type': 'Float',
                      'Description': 'three log10-scaled likelihoods for RR,RA,AA genotypes'}
        data.add_format_to_header(dic_header)
        whead = Writer(fileout[f], data)
        #TODO: whead = Writer(prm.PATH_OUT[simul], data)
        whead.write_header()
        whead.close()
        w = open(fileout[f], 'ab')
        #TODO:  w = open(prm.PATH_OUT[simul], 'ab')
        # Load adaptive GL values for missing data
        df = pd.read_csv(os.path.join(prm.WD, 'adaptive_gls.csv'),
                         header=None,
                         names=['rowsrr', 'rowsra', 'rowsaa', 'colsrr', 'colsra', 'colsaa',
                                'n', 'm',
                                'rr', 'ra', 'aa']
                         )
        # Index the adaptive GL triplets [rr, ra, aa] by their (row, col, n, m) key tuple
        df2dict = dict(((int(rwrr), int(rwra), int(rwaa), int(clrr), int(clra), int(claa),
                         int(n), int(m)),
                        [rr, ra, aa]) for rwrr, rwra, rwaa, clrr, clra, claa,
                                          n, m,
                                          rr, ra, aa in df.itertuples(index=False, name=None))

        sig = allfqc.SigmoidInterpolator(os.path.join(prm.PATH_GT_FILES, prm.RAW['gz'].replace('gl', 'gt')),
                                         os.path.join(prm.PATH_GT_FILES, prm.POOLED['gz'].replace('gl', 'gt')))
        params = sig.get_sigmoid_params()
        interp = sig.interpolate_derivative()

    else:  # prm.GTGL == 'GT' or fixed GL
        w = Writer(fileout[f], data)
        #TODO: w = Writer(prm.PATH_OUT[simul], data)
        w.set_threads(4)
        df2dict = None
        sig = None
        params = None
        interp = None

    tm = time.time()
    # for n, variant in enumerate(data('20:59973567-59973568')):
    for n, variant in enumerate(data):
        # process_line (defined elsewhere) rewrites one variant's genotypes and writes it out
        process_line(groups, f, w, variant, df2dict, sig, params, interp)
        if n % 1000 == 0:
            print('{} variants processed in {:06.2f} sec'.format(n+1, time.time()-tm).ljust(80, '.'))
        # if n+1 == 1000:
        #     break
    w.close()

    # GL converted from GT, missing GLs will be filled with [0.33, 0.33, 0.33]
    if prm.GTGL == 'GL' and prm.unknown_gl != 'adaptative':
        alltls.file_likelihood_converter(os.path.join(prm.PATH_GT_FILES,
                                                      fileout[f].replace('.gl', '.gt')) + '.gz',  # prm.PATH_OUT[simul]
                                         fileout[f])  # prm.PATH_OUT[simul]
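
An illustrative call, assuming the project's prm configuration has been initialized; the sample identifiers and file names below are placeholders:

# Illustrative invocation; requires prm to be configured first
data = VCF('input.vcf.gz')
groups = [['NA0001', 'NA0002'], ['NA0003', 'NA0004']]  # samples split into two pools
process_file(data, groups, f=0, fileout=['pooled_output.vcf'])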