def get_variant_summary_probs(self, rsid, threshold):
    """Build per-record summary statistics for one variant id.

    Every document returned by ``self.var_coll.get_variant_data_multi(rsid)``
    is annotated in place with ``selected``, ``a_af``, ``b_af``, ``hwe_p``
    and ``Missing`` and collected into the result list.

    Args:
        rsid: Variant identifier to look up.
        threshold: Passed through to ``self.var_coll.get_genotype_probs``.

    Returns:
        A ``(records, msg)`` tuple.  ``msg`` is an empty string when at
        least one record was found, otherwise a "Variant NOT FOUND" note.
    """
    summaries = []

    for record in self.var_coll.get_variant_data_multi(rsid):
        # The chromosome is always rendered with two digits (e.g. "01").
        chrom = "%.2d" % int(record["chromosome"])
        vcf_path = self.filepaths_coll.get_filepath(record["assaytype"], chrom)
        raw_record = self.var_coll.get_raw_variant_values(
            vcf_path, chrom, record['position'])

        parsed = VCFrecord(raw_record)
        _prefix, genotypes = parsed.get_prfx_sfx()
        prob_index = parsed.get_probidx()

        stats = self.var_coll.get_genotype_probs(
            genotypes, threshold, prob_index)
        # Only the count dict, both allele frequencies and the HWE p-value
        # are reported; the remaining fields are unpacked but not used.
        (category_counts, _sample_count, _geno_count, _maf,
         freq_a, freq_b, hwe_p) = stats

        record['selected'] = 1
        record['a_af'] = freq_a
        record['b_af'] = freq_b
        record['hwe_p'] = hwe_p
        record['Missing'] = (
            category_counts['Missing'] if 'Missing' in category_counts else 0)
        summaries.append(record)

    if summaries:
        return (summaries, "")
    return (summaries, "Variant NOT FOUND - %s, " % (rsid))
def check_concordancies(self, data_list, assays, chipval):
    """Check that one variant is concordant across the supplied records.

    The first non-empty record is the baseline.  Each later non-empty
    record is compared with it: the REF/ALT alleles must match exactly, and
    its genotype counts (hom-ref, het, hom-alt) must not be rejected by a
    chi-square test against the baseline counts.

    Args:
        data_list: One record per assay, in a form accepted by
            ``VCFrecord``; zero-length entries are skipped.
        assays: Assay names indexed in parallel with ``data_list``; only
            used in log messages.
        chipval: Chi-square p-values below this threshold are treated as
            discordant.

    Returns:
        ``False`` at the first allele mismatch (``self.allele_discord_count``
        is incremented) or chi-square rejection (``self.chisq_count`` is
        incremented); ``True`` when every record passes or there is nothing
        to compare.
    """
    # None (rather than "") marks "no baseline yet", so a record with an
    # empty REF allele cannot be mistaken for a missing baseline.
    base_ref = None
    base_alt = None
    obs = [0.0] * 3

    for i, vcf_record in enumerate(data_list):
        if len(vcf_record) == 0:
            continue

        vcfr = VCFrecord(vcf_record)
        # Counts are read from the record parsed on this iteration, using
        # the same argument-free call the other readers in this file use.
        (homref_count, het_count, homalt_count,
         _nc_count, _miss_count) = vcfr.get_allele_counts()
        # Add 1 to prevent 0-divide
        counts = [homref_count + 1, het_count + 1, homalt_count + 1]
        allele_a, allele_b = vcfr.get_alleles()

        if base_ref is None:
            base_ref, base_alt = allele_a, allele_b
            obs = counts
            continue

        exp = counts
        varid = vcfr.get_varid()
        # NOTE(review): get_posn() is assumed to take no arguments, like
        # get_varid(); confirm against the VCFrecord definition.
        posn = vcfr.get_posn()

        if (base_ref != allele_a) or (base_alt != allele_b):
            self.allele_discord_count += 1
            logging.info(
                "Allele discordancy: assay1=%s, assay2=%s, varid=%s, posn=%d, ref1=%s, alt1=%s, ref2=%s, alt2=%s",
                assays[0], assays[i], varid, int(posn), base_ref,
                base_alt, allele_a, allele_b)
            return False

        chi_stat, chi_p_value = chisquare(obs, f_exp=exp)
        if chi_p_value < chipval:
            self.chisq_count += 1
            logging.info(
                "CHI SQ test REJECT: assay1=%s, assay2=%s, varid=%s, posn=%d, chistat=%f, p_val=%e, chipval=%e, obs=%s, exp=%s, at %d",
                assays[0], assays[i], varid, int(posn), chi_stat,
                chi_p_value, chipval, str(obs), str(exp), i)
            return False

        logging.info(
            "CHI SQ test OK: assay1=%s, assay2=%s, varid=%s, posn=%d, chistat=%f, p_val=%e, obs=%s, exp=%s, at %d",
            assays[0], assays[i], varid, int(posn), chi_stat, chi_p_value,
            str(obs), str(exp), i)

    return True
def get_combined_array(self, buffer_list, cr_list, assay_list, threshold=0.9):
    """Merge per-assay genotype columns into one combined sample array.

    For each record in ``buffer_list`` and each genotype element in it:
      1) find the column header from the corresponding
         ``self.file_positions`` element,
      2) use that header to find the slot in ``self.combined_positions``,
      3) place the called genotype in that slot.

    When a slot is already occupied, ``self.geno_overlap_count`` is
    incremented and ``self.call_genotype`` chooses the value to keep.

    TODO - CR check (``cr_list`` is accepted but not read here).

    Args:
        buffer_list: One record per assay, in a form accepted by
            ``VCFrecord``; zero-length entries are skipped.
        cr_list: Not used by this method.
        assay_list: Assay types indexed in parallel with ``buffer_list``.
        threshold: Passed to ``self.call_geno_for_threshold``.

    Returns:
        A list with one entry per combined position; untouched slots
        hold ".".
    """
    # Index -> assay type map; built as before but not read again here.
    assay_by_index = dict(enumerate(assay_list))

    combined = ["."] * len(self.combined_positions)

    for file_idx, vcf_record in enumerate(buffer_list):
        if len(vcf_record) == 0:
            continue

        record = VCFrecord(vcf_record)
        _prefix, genotypes = record.get_prfx_sfx()
        prob_index = record.get_probidx()
        variant_id = record.get_varid()  # looked up as before; not read below
        has_at = record.has_fmt("AT")

        for col_idx, element in enumerate(genotypes):
            if element == ".":
                continue

            column_header = self.file_positions[file_idx][col_idx]
            slot = self.combined_positions[column_header]
            call = self.call_geno_for_threshold(element, prob_index, threshold)

            # Kept as an explicit comparison so that a non-bool falsy
            # result (e.g. None) behaves exactly as it did before.
            if has_at == False:
                call = call + ":" + self.assay_abbrev[assay_list[file_idx]]

            occupant = combined[slot]
            if occupant != ".":
                self.geno_overlap_count += 1
                call = self.call_genotype(occupant, call, prob_index)

            combined[slot] = call

    return combined
def get_rslist_data(self, input_rslist, threshold, download_list):
    """Collect per-variant CSV summary rows and per-sample values.

    For every id in ``input_rslist`` the matching variant documents are
    looked up; each document contributes one CSV row and one parsed
    ``VCFrecord`` that is later handed to ``self.get_sample_values``.

    Args:
        input_rslist: Variant ids to look up.
        threshold: Passed to ``self.var_coll.get_genotype_probs`` and to
            ``self.get_sample_values``.
        download_list: Not used by this method.

    Returns:
        A ``(pdata, snpdata, msg)`` tuple: ``pdata`` is whatever
        ``self.get_sample_values`` returns, ``snpdata`` is the CSV text
        (header plus one line per variant document) and ``msg`` is always
        ``None``.
    """
    msg = None
    # Rows are collected and joined once instead of repeated string "+=".
    rows = ["SNPId,AssayType,chr,pos,REF,REF_fr,ALT,ALT_fr,MAF,Imputed,CallRate,HWE_pval,Info\n"]
    data = []
    assaytypelist = []
    rslist = []
    assaytypes = {}
    Afreq = {}
    Bfreq = {}
    impDict = {}
    data_count = 0

    for rsid in input_rslist:
        docs = self.var_coll.get_variant_data_multi(rsid)
        if len(docs) > 0:
            rslist.append(rsid)

        # handling SNPs on multiple platforms
        for doc in docs:
            assaytype = doc["assaytype"]
            key = doc["rsid"] + "_" + assaytype

            # always force chromosome to 2 digits
            chromosome = "%.2d" % int(doc["chromosome"])
            fpath = self.filepaths_coll.get_filepath(assaytype, chromosome)
            fullrec = self.var_coll.get_raw_variant_values(
                fpath, chromosome, doc['position'])

            vcfr = VCFrecord(fullrec)
            _prfx, sfx = vcfr.get_prfx_sfx()
            probidx = vcfr.get_probidx()
            (_gc_count_dict, sample_count, geno_count, maf, alleleAf,
             alleleBf, p_hwe) = self.var_coll.get_genotype_probs(
                 sfx, threshold, probidx)

            Afreq[key] = alleleAf
            Bfreq[key] = alleleBf
            data_count += 1
            assaytypelist.append(assaytype)
            data.append(vcfr)
            assaytypes.setdefault(assaytype, 1)

            # A document without "info" is reported with 1.0, the value
            # the imputed check below already treats as "not imputed";
            # this avoids a KeyError when the row is formatted.
            info = doc["info"] if "info" in doc else 1.0
            imputed = 1 if ("imputed" in doc or info != 1.0) else 0
            impDict[key] = imputed

            # Guard the call rate against a record with no samples.
            if sample_count:
                call_rate = float(geno_count) / sample_count
            else:
                call_rate = 0.0

            rows.append(
                "%s,%s,%s,%d,%s,%s,%s,%s,%s,%d,%.5f,%.5f,%.5f\n" % (
                    doc["rsid"], assaytype, doc["chromosome"],
                    doc["position"], doc["alleleA"], alleleAf,
                    doc["alleleB"], alleleBf, maf, imputed, call_rate,
                    float(p_hwe), info))

    pdata = self.get_sample_values(assaytypelist, data, data_count, rslist,
                                   impDict, assaytypes, Afreq, Bfreq,
                                   threshold)
    return (pdata, "".join(rows), msg)
def main(options):
    """Convert a VCF stream on stdin into a sample-by-variant CSV on stdout.

    Lines starting with "##" are skipped.  The line starting with "#" is
    parsed as the column header and defines one output row per sample
    column.  Every other line is treated as a variant: its id becomes an
    output column and each sample's call (looked up in ``icalls``) is
    appended to that sample's row; a "." genotype becomes an empty cell.

    Args:
        options: Not used by this function.

    Returns:
        The number of sample rows printed.
    """
    hdrData = ["id"]
    sampleDict = {}
    colPosns = {}
    count = 0
    mafh = Mafhelper()

    for line in sys.stdin:
        line = line.strip()
        if line.startswith('##'):
            continue

        vcfr = VCFrecord(line)
        _prfx, sfx = vcfr.get_prfx_sfx()

        if line.startswith('#'):
            # Parse out the header record.
            for i, col_hdr in enumerate(sfx):
                colPosns[i] = col_hdr
                sampleDict[col_hdr] = []
            continue

        # Allele flipping is switched off.  The disabled logic set this
        # flag when the minor allele reported by mafh.maf() equalled ref.
        flip = False
        varid = vcfr.get_varid_ukb()
        ref, alt = vcfr.get_alleles()
        probidx = vcfr.get_probidx()
        (homref_count, het_count, homalt_count, nc_count,
         _miss_count) = vcfr.get_allele_counts()
        # NOTE(review): the result is unused while flipping is disabled;
        # the call is kept because its side effects are not visible here.
        _maf, _minor_allele = mafh.maf(het_count, homref_count, ref,
                                       homalt_count, alt, nc_count)
        hdrData.append(varid)

        for i, str_geno in enumerate(sfx):
            if str_geno == ".":
                sampleDict[colPosns[i]].append("")
                continue

            geno = str_geno.split(":")
            # NOTE(review): result unused; kept for the same reason as the
            # maf() call above - confirm it can be dropped.
            _max_prob, _max_idx = get_max_prob(geno, probidx)
            i_call = icalls[geno[0]]
            if flip:
                # Swap the homozygous calls.  The "0" case used to be a
                # comparison (==), so it never changed the call.
                if i_call == "0":
                    i_call = "2"
                elif i_call == "2":
                    i_call = "0"
            sampleDict[colPosns[i]].append(str(i_call))

    # Single-argument print(...) behaves the same under the Python 2 print
    # statement and the Python 3 print function.
    print(",".join(hdrData))
    for samp in sampleDict:
        count += 1
        print(",".join([samp] + sampleDict[samp]))
    return count