def snp_parser(in_yaml,outfile_parsed,vcf_files,bam_file_list,N,IC,min_coverage,min_samples,filter_complex=True):
    """Append linked, quality- and coverage-filtered SNP rows to outfile_parsed.

    For every transcript key of the YAML document the linkage table returned
    by get_linkage(key) is inspected; transcripts where at least 90% of the
    listed SNPs are flagged '1' are processed SNP by SNP.  A SNP is written
    when all of the following hold:

    * the first element of vcf[key][3][i] lies in
      range(N // 2 - IC, N // 2 + IC),
    * at least 90% of the values returned by quality_filter are >= 30,
    * vcf[key][3][i] has exactly one element (SNP not multiallelic),
    * filter_complex is true and at least min_samples samples have a
      coverage >= min_coverage at the position (see position_coverage).

    Parameters:
        in_yaml: path of the YAML file holding the per-transcript records.
            Presumably each record is a list whose element 0 holds the SNP
            positions -- TODO confirm the layout of elements 1-4.
        outfile_parsed: path of the tab-separated output file (appended to).
        vcf_files: passed through to quality_filter.
        bam_file_list: path of a text file with one BAM name per line; the
            digits of every line form the sample identifier, and lines
            yielding fewer than two digits are ignored.
        N, IC: define the accepted window N // 2 +/- IC (converted with int).
        min_coverage, min_samples: coverage threshold and the number of
            samples that must reach it (converted with int).
        filter_complex: when false no row is written.

    Returns:
        None.  Results are appended to outfile_parsed.

    NOTE(review): the original body referenced ``coverage`` although the
    statement computing it (and the min_coverage/min_samples check) was
    enclosed in a string literal, so the first qualifying SNP raised
    NameError.  The computation and the check are active again here, and the
    "min coverage not reached" message is printed when that check fails --
    confirm this is the intended filter.
    NOTE(review): a later definition named snp_parser in this file rebinds
    the name; confirm whether this version is still called.
    """
    import re
    import yaml

    print(' '.join(str(arg) for arg in (
        in_yaml, outfile_parsed, vcf_files, bam_file_list,
        N, IC, min_coverage, min_samples)))

    # Make sure the output file exists even when nothing is written.
    with open(outfile_parsed, 'a'):
        pass

    with open(in_yaml, 'r') as handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; consider yaml.safe_load if in_yaml is untrusted.
        vcf = yaml.load(handle.read())

    non_digits = re.compile(r'[\D]')
    with open(bam_file_list, 'r') as handle:
        digit_ids = [non_digits.sub('', line) for line in handle.read().split('\n')]
    samples = [sample_id for sample_id in digit_ids if len(sample_id) > 1]

    half = int(N) // 2
    window = range(half - int(IC), half + int(IC))
    min_depth = int(min_coverage)
    min_covered = int(min_samples)

    for key in vcf.keys():  # for all chromosomes (transcripts)
        linkage = get_linkage(key)
        record = vcf[key]
        # Require SNPs, a non-empty linkage table, and 90% of SNPs in linkage.
        if not (
            len(record) > 1
            and len(linkage) >= 1
            and len(next(iter(linkage))) > 1
            and sum(flag == '1' for flag in linkage.values()) >= 0.9 * len(linkage)
        ):
            continue

        # Translate linked SNP positions into indices of the record columns.
        linked_indices = [
            record[0].index(int(snp))
            for snp in linkage.keys()
            if linkage[snp] == '1'
        ]

        for idx in linked_indices:
            if record[3][idx][0] not in window:
                continue
            position = record[0][idx]
            qualities = quality_filter(key, position, vcf_files)
            if sum(qual >= 30 for qual in qualities) < 0.9 * len(qualities):
                continue
            if len(record[3][idx]) != 1:  # skip multiallelic SNPs
                continue
            if not filter_complex:
                continue

            # Coverage of the position in all samples of the BAM list.
            coverage = position_coverage(key, position, samples, bam_file_list)
            if sum(cov >= min_depth for cov in coverage.values()) >= min_covered:
                with open(outfile_parsed, 'a') as out:
                    out.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (
                        key, position, record[1][idx], record[2][idx][0],
                        coverage, record[4][idx]))
            else:
                print('min coverage not reached for all files or the variants are complex!')
def snp_parser(in_yaml,outfile_parsed,N,IC,min_coverage,min_samples,vcf_files,bam_file_list,linkageFile='linkageValidation.txt'):
    """Parse linked SNP haplotypes per transcript and record their coverage.

    The linkage file determines whether the SNPs observed for a gene are in
    linkage over all samples, indicated as 1 in the third field of each row
    (transcript).  0 means no linkage over samples, which might be caused by
    reads from another transcript mapping to the same GCAT gene model
    sequence.  Such transcripts should be avoided.

    A transcript is processed when its record has more than one element, at
    least two SNPs are flagged '1' by get_linkage, and the flagged SNPs make
    up at least 90% of the linkage table.  The flagged SNPs are handed to
    haplotyper, which returns two collections of SNP indices (haplo[0] and
    haplo[1], combined with ``|``, so presumably sets -- TODO confirm).  For
    haplo[1] the samples listed in vcf[key][4] are treated as the reference
    group, so their reads are added to the alternative observations and
    vice versa.

    Parameters:
        in_yaml: path of the YAML file with the per-transcript records;
            element 0 of a record holds the SNP positions and element 4 the
            sample names carrying the ALT allele, as used below.
        outfile_parsed: path of the tab-separated output file (appended to).
        N, IC: accepted for interface compatibility; not used in this body.
        min_coverage, min_samples: coverage threshold and the number of
            samples that must reach it at every haplotype position
            (converted with int).
        vcf_files: passed through to haplotyper.
        bam_file_list: path of a text file with one BAM name per line; the
            digits of every line form the sample identifier, and lines
            yielding fewer than two digits are ignored.
        linkageFile: passed through to get_linkage.

    Side effects:
        Appends to outfile_parsed, 'haplotype_lists.out',
        'alt_observations.haplotypes' and 'ref_observations.haplotypes',
        and prints progress information.

    Returns:
        None.

    NOTE(review): the original output guard was
    ``A or B and <list> + <list> >= int(min_samples)``.  Comparing a list
    with an int is always true on Python 2 (and a TypeError on Python 3), so
    the coverage filter never rejected anything.  It is implemented here as:
    at every haplotype position the number of samples (ALT and REF groups
    together) with coverage >= min_coverage must be >= min_samples --
    confirm this matches the intended filter.
    """
    import re
    import yaml

    print(' '.join(str(arg) for arg in (
        'running with arguments ' + in_yaml, outfile_parsed, vcf_files,
        bam_file_list, N, IC, min_coverage, min_samples)))

    # Make sure the output file exists even when nothing is written.
    with open(outfile_parsed, 'a'):
        pass

    with open(in_yaml, 'r') as handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary objects; consider yaml.safe_load if in_yaml is untrusted.
        vcf = yaml.load(handle.read())

    non_digits = re.compile(r'[\D]')
    with open(bam_file_list, 'r') as handle:
        digit_ids = [non_digits.sub('', line) for line in handle.read().split('\n')]
    samples = [sample_id for sample_id in digit_ids if len(sample_id) > 1]

    # Running totals of ALT / REF observations per sample over all transcripts.
    a_os = dict((s, 0) for s in samples)
    r_os = dict((s, 0) for s in samples)
    print(r_os)
    print(a_os)

    min_depth = int(min_coverage)
    min_covered = int(min_samples)

    for key in vcf.keys():  # for all chromosomes (transcripts)
        linkage = get_linkage(key, linkageFile)
        print(key)
        print(linkage)

        record = vcf[key]
        if not (len(record) > 1 and linkage):
            continue
        linked = [snp for snp in linkage.keys() if linkage[snp] == '1']
        # Need at least two linked SNPs and 90% of all SNPs under linkage.
        if len(linked) < 2 or len(linked) < 0.9 * len(linkage):
            continue

        haplo = haplotyper(vcf, key, linked, vcf_files, samples)  # haplotype parsing function
        positions = record[0]
        with open('haplotype_lists.out', 'a') as out:
            out.write(key + '\t' + str([positions[i] for i in haplo[0]])
                      + '\t' + str([positions[i] for i in haplo[1]]) + '\n')

        alt_samples = dict()
        ref_samples = dict()
        alt_coverage = dict()
        ref_coverage = dict()

        # Positions of the first list: samples in field 4 carry the ALT SNP.
        for snp_index in haplo[0]:
            pos = positions[snp_index]
            alt_samples[pos] = [non_digits.sub('', sample) for sample in record[4][snp_index]]
            ref_samples[pos] = [s for s in samples if s not in alt_samples[pos]]
            alt_coverage[pos] = position_coverage(key, pos, alt_samples[pos], bam_file_list)
            ref_coverage[pos] = position_coverage(key, pos, ref_samples[pos], bam_file_list)
            for s in alt_samples[pos]:
                a_os[s] += alt_coverage[pos][s]
            for s in ref_samples[pos]:
                r_os[s] += ref_coverage[pos][s]
        assert all(pos in alt_coverage for pos in alt_samples)
        assert all(pos in ref_coverage for pos in ref_samples)

        # Positions of the second list: the REF SNP is linked to the ALT SNPs
        # of the first list (see haplotyper), so the samples in field 4 --
        # which carry the ALT nucleotide relative to the reference -- are
        # counted as the reference group to get the coverage of the two
        # maternal haplotypes.
        for snp_index in haplo[1]:
            pos = positions[snp_index]
            ref_samples[pos] = [non_digits.sub('', sample) for sample in record[4][snp_index]]
            alt_samples[pos] = [s for s in samples if s not in ref_samples[pos]]
            ref_coverage[pos] = position_coverage(key, pos, ref_samples[pos], bam_file_list)
            alt_coverage[pos] = position_coverage(key, pos, alt_samples[pos], bam_file_list)
            for s in ref_samples[pos]:
                # The "REF" reads actually cover ALT nucleotides here.
                a_os[s] += ref_coverage[pos][s]
            for s in alt_samples[pos]:
                # ... and vice versa.
                r_os[s] += alt_coverage[pos][s]
        assert all(pos in alt_coverage for pos in alt_samples)
        assert all(pos in ref_coverage for pos in ref_samples)

        snp_indices = list(haplo[0] | haplo[1])
        # alt_coverage and ref_coverage always share the same position keys.
        enough_coverage = all(
            sum(depth >= min_depth for depth in alt_coverage[pos].values())
            + sum(depth >= min_depth for depth in ref_coverage[pos].values())
            >= min_covered
            for pos in alt_coverage
        )
        if snp_indices and enough_coverage:
            with open(outfile_parsed, 'a') as out:
                out.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (
                    key,
                    [len(record[4][i]) for i in snp_indices],
                    [positions[i] for i in snp_indices],
                    [record[2][i] for i in snp_indices],
                    [alt_coverage[positions[i]] for i in snp_indices],
                    [ref_coverage[positions[i]] for i in snp_indices],
                    alt_samples, ref_samples))
            # Printed once per transcript written, as in the original.
            print('RUN COMPLETED!!!')

    with open('alt_observations.haplotypes', 'a') as alt_out, \
            open('ref_observations.haplotypes', 'a') as ref_out:
        for s in samples:
            alt_out.write(str(s) + ' ' + str(a_os[s]) + '\n')
            ref_out.write(str(s) + ' ' + str(r_os[s]) + '\n')