def processPhasedData(vcf_filename, chromosome):
    """Pair each frequency-filtered SNP of a chromosome with a deletion and write the rows.

    The hit SNP positions, the gene annotation, the column indices of the
    children and the candidate deletions are gathered first.  The VCF file is
    then scanned; for every record whose fourth field is a single character
    (treated as a SNP by the original code) the phase and allele frequency
    are determined.  A SNP is kept only when both ``snp_af`` and
    ``1 - snp_af`` are at most ``MAJOR_AF_THRESHOLD``.  For a kept SNP,
    ``maximumR`` returns ``[max_r, p, d]``; when ``d`` is not ``None`` one
    row is collected::

        [chromosome, type_snp, snp_pos, snp_type, snp_af,
         d['pos'], d['length'], d['af'], max_r, p, dist_snp_del, dist_tss]

    ``type_snp`` is 1 when the position is among the hit SNP positions and 0
    otherwise.  Hit SNPs take the same processing path as all other SNPs.
    The collected rows are handed to ``writeToOutputFile``.

    Args:
        vcf_filename: name of the VCF file, passed to the ``vcf`` helpers.
        chromosome: chromosome identifier, passed to the readers and written
            as the first column of every row.

    Returns:
        None.  Results are emitted through ``writeToOutputFile``.
    """
    # The second value returned by readHitSNPs is not used by this function.
    hit_snp_positions, _ = readHitSNPs(chromosome)
    genes = hg19.read(chromosome)
    indices_children = vcf.returnColumnsOfChildren(vcf_filename)
    deletions = vcf.returnDeletions(chromosome, indices_children, MAJOR_AF_THRESHOLD)

    vcf_file = vcf.openVCFFile(vcf_filename)
    try:
        vcf.discardVCFHeaders(vcf_file)
        vcf_file.readline()  # discard the column description header

        output = []  # rows collected for the output file
        for variant in vcf_file.readlines():
            data = variant.split()
            if not data:
                # A blank line has no fields; skip it instead of failing on data[3].
                continue
            if len(data[3]) != 1:
                # Not a SNP according to the original single-character test.
                continue

            snp_pos = int(data[1])
            phase = vcf.returnPhase(data, indices_children)
            snp_af = vcf.determineAlleleFrequency(phase)
            if not (snp_af <= MAJOR_AF_THRESHOLD and 1 - snp_af <= MAJOR_AF_THRESHOLD):
                continue

            snp_type = hg19.determineType(snp_pos, genes)
            dist_tss = hg19.distanceToTSS(snp_pos, genes)
            max_r, p, d = maximumR(snp_pos, phase, deletions)
            # NOTE(review): if hit_snp_positions is a list this lookup is linear
            # per SNP -- confirm its type against readHitSNPs.
            type_snp = 1 if snp_pos in hit_snp_positions else 0
            if d is None:
                continue

            dist_snp_del = distanceSNP_CNV(snp_pos, d['pos'], d['length'])
            output.append([chromosome, type_snp, snp_pos, snp_type, snp_af,
                           d['pos'], d['length'], d['af'], max_r, p,
                           dist_snp_del, dist_tss])

        writeToOutputFile(output, chromosome)
    finally:
        # Close the VCF handle even when one of the helpers raises.
        vcf_file.close()
def preprocessData(vcf_filename, chromosome, individuals=('parents', 'children')):
    """Pair every SNP of a chromosome with a deletion, write the rows and print counts.

    ``gatherData`` supplies the hit SNP positions, the hit SNP data, the list
    of already discarded positions, the gene annotation, the column indices
    of the selected individuals and the candidate deletions.  The VCF file is
    then scanned; every record whose fourth field is a single character
    (treated as a SNP by the original code) is handled as follows:

    * position among the hit SNP positions: phase and allele frequency come
      from ``processHitSNP``;
    * otherwise, position not among the discarded positions: phase and allele
      frequency come from ``vcf.returnPhase`` /
      ``vcf.determineAlleleFrequency``;
    * otherwise the record is ignored.

    ``returnSnpDeletionPair`` then yields
    ``[max_r, p, d, dist_snp_del, min_dist_snp_del]``.  When ``d`` is
    ``None`` the SNP is counted as discarded and its position is appended to
    the discarded list; otherwise one row is collected::

        [chromosome, is_hit, snp_pos, snp_type, snp_af,
         d['pos'], d['length'], d['af'], max_r, p, dist_snp_del, dist_tss,
         min_dist_snp_del, g, g_n, a, b, c, e]

    with ``is_hit`` equal to 1 for hit SNPs and 0 for the others, ``g, g_n``
    from ``gamma.returnGamma`` and ``a, b, c, e`` from
    ``vcf.return2x2Table``.

    Args:
        vcf_filename: name of the VCF file, passed to the ``vcf`` helpers.
        chromosome: chromosome identifier, written as the first column.
        individuals: passed through to ``gatherData``.

    Returns:
        None.  Rows go to ``writeToOutputFile``; a summary line with the
        three counters is printed to standard output.
    """
    (hit_snps_positions, hit_snps, discarded_snps_positions,
     genes, indices_individuals, deletions) = gatherData(vcf_filename, chromosome, individuals)

    vcf_file = vcf.openVCFFile(vcf_filename)
    try:
        vcf.discardVCFHeaders(vcf_file)
        vcf_file.readline()  # discard the column description header

        output = []  # results are stored here and later written to file
        n_hit_snps, n_discarded_snps, n_nonhit_snps = 0, 0, 0

        for variant in vcf_file.readlines():  # move through the phased variants
            data = variant.split()
            if not data:
                # A blank line has no fields; skip it instead of failing on data[3].
                continue
            if len(data[3]) != 1:
                # Not a SNP according to the original single-character test.
                continue

            snp_pos = int(data[1])
            snp_type = hg19.determineType(snp_pos, genes)
            dist_tss = hg19.distanceToTSS(snp_pos, genes)

            # NOTE(review): both membership tests are linear per SNP if these
            # containers are lists (the discarded one is appended to below) --
            # confirm the types returned by gatherData.
            if snp_pos in hit_snps_positions:
                is_hit = 1
                phase, snp_af = processHitSNP(snp_pos, data, hit_snps, indices_individuals)
            elif snp_pos not in discarded_snps_positions:
                is_hit = 0
                phase = vcf.returnPhase(data, indices_individuals)
                snp_af = vcf.determineAlleleFrequency(phase)
            else:
                continue  # previously discarded non-hit SNP

            (max_r, p, d, dist_snp_del,
             min_dist_snp_del) = returnSnpDeletionPair(snp_pos, phase, deletions)
            if d is None:
                # No deletion was paired with this SNP: remember it as discarded.
                n_discarded_snps += 1
                discarded_snps_positions.append(snp_pos)
                continue

            g, g_n = gamma.returnGamma(phase, d['phase'], snp_af, d['af'])
            a, b, c, e = vcf.return2x2Table(phase, d['phase'])
            if is_hit:
                n_hit_snps += 1
            else:
                n_nonhit_snps += 1
            output.append([chromosome, is_hit, snp_pos, snp_type, snp_af,
                           d['pos'], d['length'], d['af'], max_r, p,
                           dist_snp_del, dist_tss, min_dist_snp_del,
                           g, g_n, a, b, c, e])

        writeToOutputFile(output, chromosome)
    finally:
        # Close the VCF handle even when one of the helpers raises.
        vcf_file.close()

    # Single-argument print: emits the same space-separated text as the
    # original Python 2 print statement and is also valid on Python 3.
    summary = ('# hit SNPs: ', n_hit_snps,
               '\t#discarded SNPs:', n_discarded_snps,
               '\t# non hit SNPs: ', n_nonhit_snps)
    print(' '.join(str(item) for item in summary))
def processSNPs(vcf_filename, chromosome, individuals=('parents', 'children')):
    """Write, per frequency-filtered SNP, a header entry plus its deletions.

    The hit SNP positions, hit SNP data and discarded positions are read via
    ``pr.readHitSNPs``; gene annotation, column indices of the selected
    individuals and the candidate deletions are gathered as well.  The VCF
    file is then scanned; every record whose fourth field is a single
    character (treated as a SNP by the original code) is handled as follows:

    * position among the hit SNP positions: labelled ``'HITSNP'``, phase and
      allele frequency come from ``pr.processHitSNP``;
    * otherwise, position not among the discarded positions: labelled
      ``'NONHITSNP'``, phase and allele frequency come from
      ``vcf.returnPhase`` / ``vcf.determineAlleleFrequency``;
    * otherwise the record is ignored.

    A SNP is kept only when
    ``1 - MAJOR_AF_THRESHOLD <= snp_af <= MAJOR_AF_THRESHOLD``.  For a kept
    SNP the list ``[[label, snp_pos, snp_af, snp_type, dist_tss]]`` is built,
    the result of ``returnDeletions`` is appended as a second element when it
    is non-empty, and the list is passed to ``writeSNPToOutputFile``.

    Args:
        vcf_filename: name of the VCF file, passed to the ``vcf`` helpers.
        chromosome: chromosome identifier, passed to the readers and writer.
        individuals: passed through to ``vcf.returnColumns``.

    Returns:
        None.  Results are emitted through ``writeSNPToOutputFile``.
    """
    # gather the required data
    hit_snps_positions, hit_snps, discarded_snps_positions = pr.readHitSNPs(chromosome)
    genes = hg19.read(chromosome)
    indices_individuals = vcf.returnColumns(vcf_filename, individuals)
    deletions = vcf.returnDeletions(chromosome, indices_individuals, MAJOR_AF_THRESHOLD)

    vcf_file = vcf.openVCFFile(vcf_filename)
    try:
        vcf.discardVCFHeaders(vcf_file)
        vcf_file.readline()  # discard the column description header

        for variant in vcf_file.readlines():  # move through the genetic variants
            data = variant.split()
            if not data:
                # A blank line has no fields; skip it instead of failing on data[3].
                continue
            if len(data[3]) != 1:
                # Not a SNP according to the original single-character test.
                continue

            snp_pos = int(data[1])
            snp_type = hg19.determineType(snp_pos, genes)
            dist_tss = hg19.distanceToTSS(snp_pos, genes)

            if snp_pos in hit_snps_positions:
                label = 'HITSNP'
                phase, snp_af = pr.processHitSNP(snp_pos, data, hit_snps, indices_individuals)
            elif snp_pos not in discarded_snps_positions:
                label = 'NONHITSNP'
                phase = vcf.returnPhase(data, indices_individuals)
                snp_af = vcf.determineAlleleFrequency(phase)
            else:
                continue  # previously discarded non-hit SNP

            if not (1 - MAJOR_AF_THRESHOLD <= snp_af <= MAJOR_AF_THRESHOLD):
                continue

            snp_output = [[label, snp_pos, snp_af, snp_type, dist_tss]]
            # TODO implement (carried over from the original).
            # NOTE(review): this unqualified returnDeletions takes different
            # arguments than vcf.returnDeletions used above -- confirm it exists.
            dels = returnDeletions(snp_pos, snp_af, phase, deletions)
            if len(dels) > 0:
                snp_output.append(dels)
            writeSNPToOutputFile(snp_output, chromosome)
    finally:
        # Close the VCF handle even when one of the helpers raises.
        vcf_file.close()