import os
import random

from cyvcf2 import VCF, Writer

# REF_GENOME and append_variants_to_vcf are assumed to come from the
# surrounding module: REF_GENOME as an iterable of (chromosome, length)
# pairs, append_variants_to_vcf as a builder of a shell command string.


def generate_sample_vcf(filename='/Users/simonelongo/too_big_for_icloud/gnomad.genomes.r2.1.1.sites.vcf.bgz'):
    """Draw a random 1 kb window from each chromosome of a large VCF to
    build a smaller VCF for testing."""
    vcf = VCF(filename)
    write = Writer('samp_build38.vcf', vcf)
    write.write_header()
    for chrom_num, chrom_len in REF_GENOME:
        # pick a window start away from the chromosome ends
        begin = random.randint(1000, chrom_len - 1000)
        os.system(append_variants_to_vcf(chrom_num, begin, begin + 1000))
    write.close()
import os
import random

from cyvcf2 import VCF, Writer

# get_autosome_names_grch38 and append_variants_to_vcf are assumed to come
# from the surrounding module.


def generate_sample_vcf(vcf_path, outfile='samp_build38.vcf'):
    """Draw a random 10 kb window from each autosome of a large VCF to
    build a smaller VCF for testing."""
    vcf = VCF(vcf_path)
    write = Writer(outfile, vcf)
    write.write_header()
    # map autosome names to their lengths using the VCF header's contigs
    chroms = get_autosome_names_grch38()
    chrom_keys = {name: length
                  for name, length in zip(vcf.seqnames, vcf.seqlens)
                  if name in chroms}
    for chrom_num, chrom_len in chrom_keys.items():
        # pick a window start well away from the chromosome ends
        begin = random.randint(100000, chrom_len - 100000)
        os.system(append_variants_to_vcf(vcf_path, chrom_num, begin,
                                         begin + 10_000, outfile=outfile))
    write.close()
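# append_variants_to_vcf() is not defined in these snippets. A minimal sketch
# of what it might look like, assuming it mirrors the tabix command built by
# gen_com() below - the signature and region-query approach are assumptions,
# not the author's confirmed implementation.
def append_variants_to_vcf(vcf_path, chrom, start, stop, outfile='samp_build38.vcf'):
    """Return a shell command appending variants in chrom:start-stop to outfile."""
    # tabix requires vcf_path to be bgzip-compressed and indexed (.tbi)
    return f'tabix {vcf_path} {chrom}:{start}-{stop} >> {outfile}'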
from os import system
import random

from pyfaidx import Fasta
from cyvcf2 import VCF, Writer


def gen_com(chrom, start, stop):
    """Build a tabix command appending variants in chrom:start-stop to samp.vcf."""
    return ('tabix /Users/simonelongo/too_big_for_icloud/gnomad.genomes.r2.1.1.sites.vcf.bgz '
            f'{chrom}:{start}-{stop} >> samp.vcf')


fa = Fasta('/Users/simonelongo/too_big_for_icloud/REFERENCE_GENOME_GRch37.fa')
vcf = VCF('/Users/simonelongo/too_big_for_icloud/gnomad.genomes.r2.1.1.sites.vcf.bgz')

# write the header first; tabix output is then appended to the same file
write = Writer('samp.vcf', vcf)
write.write_header()

keys = list(fa.keys())
for key in keys[0:25]:
    if key != 'MT':  # skip the mitochondrial contig
        begin = random.randint(1000, len(fa[key]) - 1000)
        system(gen_com(key, begin, begin + 1000))
write.close()
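# The tabix queries above assume a .tbi index sits next to the .vcf.bgz file.
# A sketch of creating one if it is missing (assumes the tabix binary from
# htslib is on PATH):
from os import system
system('tabix -p vcf /Users/simonelongo/too_big_for_icloud/gnomad.genomes.r2.1.1.sites.vcf.bgz')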
import os
import sys

from cyvcf2 import VCF, Writer
from tqdm import tqdm

# check_record is assumed to be defined elsewhere in this module.


def parse_records(vcf, gq, out_format, vcf_type, verbose_level,
                  purity_filter, single_sample, out):
    '''
    Iterates through the VCF and writes records passing the filters to file.

    Parameters
    ----------
    vcf : str
        File path to input VCF
    gq : int
        GQ threshold (records > gq kept)
    out_format : str
        [table|vcf] - whether to write a new VCF or a tab-separated file
    vcf_type : str
        [combined|pairs] - whether the VCF contains just the 0/5 pairs
        or other samples as well
    verbose_level : int
        How much to print to console:
        0 - no progress info besides the tqdm bar
        1 - print counter at each chromosome completion
        2 - print all candidate information as it is found
    purity_filter : bool
        if enabled, filters out records where the mutated site has >2 reads
        of the non-mutant allele (see check_record)
    single_sample : str or None
        if provided, assume there is only one sample of interest in the VCF
        (e.g. just one of 0 and 5) - also assumes 'combined' mode
    out : str
        Name of file to write to

    Returns
    -------
    None
        Writes to specified file.
    '''
    print(f'the verbose level is {verbose_level}')
    vcf_in = VCF(vcf)
    sample_names = vcf_in.samples
    pair_sample_names = sorted([
        item for item in sample_names
        if item.endswith('_0') or item.endswith('_5')
    ])
    if single_sample:
        try:
            assert len(pair_sample_names) == 1
        except AssertionError:
            print(f'[saltMA] ERROR: samples bonked - {pair_sample_names}')
            print('[saltMA] Exiting...')
            sys.exit()

    # get sample indices
    try:
        if not single_sample:
            sample_lookup = (sample_names.index(pair_sample_names[0]),
                             sample_names.index(pair_sample_names[1]))
        else:
            shortlist = [
                s for s in sample_names
                if single_sample in s and s != pair_sample_names[0]
            ]
            sample_lookup = (sample_names.index(shortlist[0]),
                             sample_names.index(pair_sample_names[0]))
            print(f'[saltMA] selected samples are '
                  f'{pair_sample_names[0]} and {shortlist[0]}')
    except IndexError:
        print('[saltMA] ERROR: Samples seem incorrect. '
              'Ensure you have a 0 and 5 sample in the VCF, '
              'unless --single_sample has been selected.')
        print('[saltMA] Exiting...')
        sys.exit()

    print(f'[saltMA] initiating filtering for {os.path.basename(vcf)}...')
    counter = 0
    total_count = 0
    doublemut_count = 0
    prev_chr = None

    if out_format == 'vcf':
        outfile = Writer(out, vcf_in)
        outfile.write_header()
    elif out_format == 'table':
        f = open(out, 'w')
        header_string = '\t'.join([
            'fname', 'chrom', 'pos', 'ref', 'alt',
            'gt_bases', 'gt_quals', 'gt_depths'
        ])
        f.write(header_string + '\n')
        basename = os.path.basename(vcf).replace('.vcf.gz', '')

    for record in tqdm(vcf_in):
        total_count += 1
        check = check_record(record, vcf_type, sample_lookup, gq=gq,
                             purity_filter=purity_filter,
                             single_sample=single_sample)
        if check == 'doublemut':
            if verbose_level == 2:
                tqdm.write(f'[saltMA] doublemut at {record.CHROM}:{record.POS}')
            doublemut_count += 1
            continue
        elif check:
            counter += 1
            if verbose_level == 1:
                if not prev_chr:
                    prev_chr = record.CHROM
                    tqdm.write(f'[saltMA] first chrom with detected mut is {prev_chr}')
                    continue
                elif prev_chr != record.CHROM:
                    tqdm.write(f'[saltMA] {prev_chr} completed.')
                    tqdm.write(f'[saltMA] current count is {counter}')
                    prev_chr = record.CHROM
            elif verbose_level == 2:
                tqdm.write(f'[saltMA] candidate mut found at {repr(record)}')
                tqdm.write(f'[saltMA] current count is {counter}')
                tqdm.write(f'[saltMA] doublemut count is {doublemut_count}')
            if out_format == 'vcf':
                outfile.write_record(record)
            elif out_format == 'table':
                out_string = '\t'.join([
                    basename, record.CHROM, str(record.POS), record.REF,
                    str(record.ALT), str(list(record.gt_bases)),
                    str(list(record.gt_quals)), str(list(record.gt_depths))
                ]) + '\n'
                f.write(out_string)

    if out_format == 'table':
        f.close()
    print(f'[saltMA] completed search for {os.path.basename(vcf)}')
    print(f'[saltMA] found {counter} matches over {total_count} sites.')
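# A hypothetical invocation of parse_records(); the file names and GQ
# threshold are placeholders, and the sample naming follows the _0/_5
# convention assumed above.
parse_records('mut_calls.vcf.gz', gq=30, out_format='table',
              vcf_type='combined', verbose_level=1, purity_filter=True,
              single_sample=None, out='candidate_muts.tsv')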
from cyvcf2 import VCF, Writer
from tqdm import tqdm


def vcfprep(args):
    """
    Iterate through parental VCF and remove records by given filters.

    All records where the parent1 allele is the same as the parent2 allele
    ('uninformative sites') are automatically removed.

    Raises ValueError if >2 samples in VCF (should just be 2 parents).

    Parameters
    ----------
    args : Namespace
        Namespace containing all user-given arguments compiled by arg_parse()

    Returns
    -------
    total_count : int
        total records considered
    kept_count : int
        number of records that passed filters
    """
    vcf_in = VCF(args.vcf)
    if len(vcf_in.samples) > 2:
        raise ValueError('more than 2 parental samples in input VCF')
    if args.snps_only and args.indels_only:
        raise ValueError('both --snps_only and --indels_only provided. pick one or neither!')

    # the Writer emits plain VCF, so strip a .gz suffix from the output name
    if args.out.endswith('.gz'):
        outfile = args.out.replace('.gz', '')
    else:
        outfile = args.out
    vcf_out = Writer(outfile, vcf_in)
    vcf_out.write_header()

    total_count, kept_count = 0, 0
    for record in tqdm(vcf_in):
        total_count += 1
        # skip records with no ALT allele
        if len(record.ALT) == 0:
            continue
        # SNP filter
        if args.snps_only and not record.is_snp:
            continue
        # indel filter
        if args.indels_only and not record.is_indel:
            continue
        # heterozygote call filter
        if args.no_hets and record.num_het != 0:
            continue
        # genotype quality filter
        if not all(record.gt_quals >= args.min_GQ):
            continue
        # ensure parental alleles differ
        if record.gt_bases[0] == record.gt_bases[1]:
            continue
        # only reached if record not caught by the filters above
        vcf_out.write_record(record)
        kept_count += 1
    return total_count, kept_count
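# A minimal sketch of calling vcfprep() directly, bypassing arg_parse(); any
# object exposing the expected attributes works, and all values here are
# illustrative placeholders.
from argparse import Namespace

args = Namespace(vcf='parents.vcf.gz', out='parents_filtered.vcf.gz',
                 snps_only=True, indels_only=False, no_hets=True, min_GQ=30)
total, kept = vcfprep(args)
print(f'kept {kept} of {total} records')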
import os
import time

import pandas as pd
from cyvcf2 import VCF, Writer

# prm (parameters module), allfqc, alltls and process_line are assumed to be
# imported from the surrounding project.


def process_file(data: VCF, groups: list, f: int, fileout: list) -> None:
    # TODO: clean/refactor execution comments like processed file name
    # TODO: refactor processing list of files into single-file processing + remove MSS param
    # data: VCF, groups: list, simul: str, fileout: str
    """
    Computes and rewrites genotypes of all individuals for all samples from input files.

    :param data: cyvcf2 object reader pointing to a VCF file
    :param groups: samples identifiers split in pools
    :param f: integer, index of the file to process in the list
    :param fileout: VCF files with simulated pooled or randomly missing genotypes
    """
    print('Simulation type: ', 'simul')
    print('file out: ', os.path.join(os.getcwd(), fileout[f]))  # prm.PATH_OUT[simul]

    if prm.GTGL == 'GL' and prm.unknown_gl == 'adaptative':
        dic_header = {'ID': 'GL', 'Number': 'G', 'Type': 'Float',
                      'Description': 'three log10-scaled likelihoods for RR,RA,AA genotypes'}
        data.add_format_to_header(dic_header)
        whead = Writer(fileout[f], data)  # TODO: whead = Writer(prm.PATH_OUT[simul], data)
        whead.write_header()
        whead.close()
        w = open(fileout[f], 'ab')  # TODO: w = open(prm.PATH_OUT[simul], 'ab')

        # load adaptive GL values for missing data
        df = pd.read_csv(os.path.join(prm.WD, 'adaptive_gls.csv'),
                         header=None,
                         names=['rowsrr', 'rowsra', 'rowsaa',
                                'colsrr', 'colsra', 'colsaa',
                                'n', 'm', 'rr', 'ra', 'aa'])
        df2dict = dict(((int(rwrr), int(rwra), int(rwaa),
                         int(clrr), int(clra), int(claa),
                         int(n), int(m)),
                        [rr, ra, aa])
                       for rwrr, rwra, rwaa, clrr, clra, claa, n, m, rr, ra, aa
                       in df.itertuples(index=False, name=None))

        sig = allfqc.SigmoidInterpolator(
            os.path.join(prm.PATH_GT_FILES, prm.RAW['gz'].replace('gl', 'gt')),
            os.path.join(prm.PATH_GT_FILES, prm.POOLED['gz'].replace('gl', 'gt')))
        params = sig.get_sigmoid_params()
        interp = sig.interpolate_derivative()

    else:  # prm.GTGL == 'GT' or fixed GL
        w = Writer(fileout[f], data)  # TODO: w = Writer(prm.PATH_OUT[simul], data)
        w.set_threads(4)
        df2dict = None
        sig = None
        params = None
        interp = None

    tm = time.time()
    # for n, variant in enumerate(data('20:59973567-59973568')):
    for n, variant in enumerate(data):
        process_line(groups, f, w, variant, df2dict, sig, params, interp)
        if n % 1000 == 0:
            print('{} variants processed in {:06.2f} sec'.format(n + 1, time.time() - tm).ljust(80, '.'))
        # if n+1 == 1000:
        #     break
    w.close()

    # GL converted from GT: missing GLs will be filled with [0.33, 0.33, 0.33]
    if prm.GTGL == 'GL' and prm.unknown_gl != 'adaptative':
        alltls.file_likelihood_converter(
            os.path.join(prm.PATH_GT_FILES, fileout[f].replace('.gl', '.gt')) + '.gz',  # prm.PATH_OUT[simul]
            fileout[f])  # prm.PATH_OUT[simul]
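# A hypothetical driver for process_file(), assuming `groups` (sample pools)
# and the prm configuration were set up elsewhere; the chunk file names are
# placeholders.
fileout = ['chunk1.pooled.vcf', 'chunk2.pooled.vcf']
for f, path in enumerate(['chunk1.vcf.gz', 'chunk2.vcf.gz']):
    process_file(VCF(path), groups, f, fileout)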