def main(options):
    """Rewrite the sample IDs in the column-header line of a VCF on stdin.

    The sample map is loaded from ``options.convfile`` via
    ``load_sample_map``.  ``##`` meta lines and data lines are echoed
    unchanged; on the single ``#`` column-header line every sample column is
    replaced by ``smap[sample]``.

    If the map cannot be opened or loaded, the exception type is printed and
    the process exits.  A sample missing from the map raises from the
    ``smap[...]`` lookup.

    Returns:
        The value of ``count``, which is never incremented here, so 0.
    """
    try:
        # The context manager closes the map file even when loading fails.
        with open(options.convfile, "r") as fh:
            smap = load_sample_map(fh)
    except Exception:
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        sys.exit()

    count = 0
    for line in sys.stdin:
        line = line.strip()
        if line.startswith('##'):
            print(line)
        elif line.startswith('#'):
            vcfr = VCFrecord(line)
            prfx, sfx = vcfr.get_prfx_sfx()
            sfx = [smap[elem] for elem in sfx]
            print("\t".join(prfx) + "\t" + "\t".join(sfx))
        else:
            print(line)
    return count
def main(options):
    """Load the sample columns of a VCF header on stdin into the sample store.

    ``##`` meta lines are skipped.  On the ``#`` column-header line each
    sample column is passed to ``godb.process_sample_detail`` together with
    its index and ``options.assaytype``; the buffer is flushed whenever
    ``godb.get_samples_len()`` exceeds the module-level ``flush_at``.
    Reading stops after the header line, and any remainder is flushed.

    Returns:
        The number of sample columns processed.
    """
    try:
        godb = GoDb()
    except Exception:
        # ``Exception`` rather than a bare except, so Ctrl-C and SystemExit
        # are not reported as an "unexpected error".
        print("Unexpected error: %s" % (sys.exc_info()[0],))
        sys.exit()

    count = 0
    for line in sys.stdin:
        line = line.strip()
        if line.startswith('##'):
            continue
        if line.startswith('#'):
            vcfr = VCFrecord(line)
            _, sfx = vcfr.get_prfx_sfx()
            for idx, field in enumerate(sfx):
                count += 1
                godb.process_sample_detail(field, idx, options.assaytype)
                if godb.get_samples_len() > flush_at:
                    godb.flush_sample_buff()
            # Only the header line is of interest; stop reading.
            break
    godb.flush_sample_buff()
    print("")
    return count
def get_variant_summary_probs(self, rsid, threshold):
    """Collect per-assay summary statistics for one variant id.

    Every document returned by ``self.var_coll.get_variant_data_multi`` is
    enriched in place with ``selected``, ``a_af``, ``b_af``, ``hwe_p`` and
    ``Missing`` before being added to the result.

    Returns:
        ``(documents, msg)``; ``msg`` is empty unless no document was found.
    """
    summaries = []
    for doc in self.var_coll.get_variant_data_multi(rsid):
        # Chromosome is always rendered as two digits.
        chrom = "%.2d" % int(doc["chromosome"])
        path = self.filepaths_coll.get_filepath(doc["assaytype"], chrom)
        raw = self.var_coll.get_raw_variant_values(path, chrom,
                                                   doc['position'])
        record = VCFrecord(raw)
        _, genotypes = record.get_prfx_sfx()
        prob_index = record.get_probidx()
        (call_counts, _n_samples, _n_genos, _maf, a_freq, b_freq,
         hwe_p) = self.var_coll.get_genotype_probs(genotypes, threshold,
                                                   prob_index)
        doc['selected'] = 1
        doc['a_af'] = a_freq
        doc['b_af'] = b_freq
        doc['hwe_p'] = hwe_p
        if 'Missing' in call_counts:
            doc['Missing'] = call_counts['Missing']
        else:
            doc['Missing'] = 0
        summaries.append(doc)

    if summaries:
        note = ""
    else:
        note = "Variant NOT FOUND - %s, " % (rsid)
    return (summaries, note)
def get_dbsnp_rsid(dbsnpfile, chrom, posn):
    """Look up the dbSNP id and reference allele at a position.

    The record is fetched with ``dbsnpfile.get_dbsnp_file_record`` using the
    module-level ``options.dbsnpfile``, ``chrom`` and ``int(posn)``.

    Returns:
        ``(rsid, refallele)``; both are empty strings when no record is
        returned.
    """
    dbsnprec = dbsnpfile.get_dbsnp_file_record(options.dbsnpfile, chrom,
                                               int(posn))
    # Identity test: "no record" is signalled by None itself.
    if dbsnprec is None:
        return "", ""
    dbvcf = VCFrecord(dbsnprec)
    rsid = dbvcf.get_varid()
    refallele, _ = dbvcf.get_alleles()
    return rsid, refallele
def __init__(self, db, filedata_coll, sample_coll, gwasdb, probidx=1):
    """Store the collaborators and set up the genotype-call tables.

    ``db.markers`` becomes ``self.markers``; an empty ``VCFrecord`` is kept
    on ``self.vcfr``.
    """
    # Collaborating objects supplied by the caller.
    self.db = db
    self.gwasdb = gwasdb
    self.filedata_coll = filedata_coll
    self.sample_coll = sample_coll
    self.markers = db.markers

    # Call labels and their integer codes (presumably index-aligned).
    self.calls = ["0/0", "0/1", "1/1", "Missing"]
    self.icalls = [0, 1, 2, -9]

    self.probidx = probidx
    self.vcfr = VCFrecord()
def process_variant_detail_vcf(self, record, assaytype):
    """Build a JSON-style variant document from a VCF record and buffer it.

    The document holds the assay type, variant id, chromosome (as a 2-digit
    string), both alleles and the integer position.  ``ref_maf`` is added
    only when the ``RefPanelAF`` INFO value converts to float; ``info``
    falls back to 1.0 when the ``INFO`` value is missing or not numeric.
    The document is appended to ``self.variantbuff``.
    """
    doc = {}
    doc["assaytype"] = assaytype
    vcfr = VCFrecord(record)
    # NOTE(review): the result is unused here; the call is kept in case it
    # primes parser state - confirm against VCFrecord and drop if pure.
    vcfr.get_prfx_sfx()
    doc["rsid"] = vcfr.get_varid()
    # Always store the chromosome as a 2-digit string.
    doc["chromosome"] = "%.2d" % (int(vcfr.get_chr()))
    alleleA, alleleB = vcfr.get_alleles()
    doc["alleleA"] = alleleA
    doc["alleleB"] = alleleB
    doc["position"] = vcfr.get_posn_as_int()
    try:
        doc["ref_maf"] = float(vcfr.get_info_value("RefPanelAF"))
    except Exception:
        # Best effort: the reference-panel frequency is optional.
        pass
    try:
        doc["info"] = float(vcfr.get_info_value("INFO"))
    except Exception:
        doc["info"] = 1.0
    self.variantbuff.append(doc)
def check_concordancies(self, data_list, assays, chipval):
    """Check that every assay's record for a variant agrees with the first.

    The first non-empty record supplies the reference alleles and the
    observed genotype counts.  Each later non-empty record must carry the
    same REF/ALT alleles, and its genotype counts are compared with the
    first record's by a chi-square test.

    Args:
        data_list: one raw VCF record per assay; empty entries are skipped.
        assays: assay names, index-aligned with ``data_list`` (log output).
        chipval: p-value below which the chi-square test rejects.

    Returns:
        False on the first allele mismatch or chi-square rejection (the
        matching counter on ``self`` is incremented), otherwise True.
    """
    obs = [0.0] * 3
    exp = [0.0] * 3
    allele_ref_1 = ""
    allele_alt_1 = ""
    for i, vcf_record in enumerate(data_list):
        if len(vcf_record) == 0:
            continue
        # All values come from the record parsed for this assay; the earlier
        # code passed an undefined ``data`` array to the accessors.
        vcfr = VCFrecord(vcf_record)
        homref_count, het_count, homalt_count, _nc_count, _miss_count = \
            vcfr.get_allele_counts()
        allele_a, allele_b = vcfr.get_alleles()
        # Add 1 to every cell to prevent a 0-divide in the chi-square test.
        counts = [homref_count + 1, het_count + 1, homalt_count + 1]

        if allele_ref_1 == "":
            # First usable record: it becomes the baseline.
            allele_ref_1 = allele_a
            allele_alt_1 = allele_b
            obs = counts
            continue

        exp = counts
        varid = vcfr.get_varid()
        posn = vcfr.get_posn_as_int()
        if (allele_ref_1 != allele_a) or (allele_alt_1 != allele_b):
            self.allele_discord_count += 1
            logging.info(
                "Allele discordancy: assay1=%s, assay2=%s, varid=%s, posn=%d, ref1=%s, alt1=%s, ref2=%s, alt2=%s",
                assays[0], assays[i], varid, int(posn), allele_ref_1,
                allele_alt_1, allele_a, allele_b)
            return False

        chi_stat, chi_p_value = chisquare(obs, f_exp=exp)
        if chi_p_value < chipval:
            self.chisq_count += 1
            logging.info(
                "CHI SQ test REJECT: assay1=%s, assay2=%s, varid=%s, posn=%d, chistat=%f, p_val=%e, chipval=%e, obs=%s, exp=%s, at %d",
                assays[0], assays[i], varid, int(posn), chi_stat,
                chi_p_value, chipval, str(obs), str(exp), i)
            return False
        logging.info(
            "CHI SQ test OK: assay1=%s, assay2=%s, varid=%s, posn=%d, chistat=%f, p_val=%e, obs=%s, exp=%s, at %d",
            assays[0], assays[i], varid, int(posn), chi_stat, chi_p_value,
            str(obs), str(exp), i)
    return True
def main():
    """Print each sample id from the VCF column-header line on stdin.

    ``##`` meta lines are ignored; reading stops once the ``#`` header line
    has been printed.

    Returns:
        0 - the counter is never incremented.
    """
    count = 0
    for raw in sys.stdin:
        text = raw.strip()
        if text.startswith('##'):
            continue
        if text.startswith('#'):
            _, samples = VCFrecord(text).get_prfx_sfx()
            for sample in samples:
                print(sample)
            # Header handled; nothing further is needed.
            break
    return count
def main(options):
    """Rewrite the chromosome of every VCF data line on stdin.

    The chromosome map is loaded from ``options.chrommap`` via
    ``load_chrom_map``.  Header lines (starting with ``#``) are echoed
    unchanged; on data lines the chromosome is replaced by
    ``chrom_map[chromosome]`` and the rebuilt record is printed.

    If the map cannot be loaded, or a chromosome cannot be mapped, a message
    is emitted and the process exits.

    Returns:
        The number of input lines read, headers included.
    """
    try:
        # The context manager closes the map file even when loading fails.
        with open(options.chrommap) as fh:
            chrom_map = load_chrom_map(fh)
    except Exception:
        print("Unable to open %s" % (options.chrommap,))
        sys.exit()

    count = 0
    for line in sys.stdin:
        count += 1
        line = line.strip()
        if line.startswith('#'):
            print(line)
            continue
        vcfr = VCFrecord(line)
        strchrom = vcfr.get_chr()
        try:
            vcfr.set_chr(chrom_map[strchrom])
        except Exception:
            logging.info("Chromosome not found in map %s, %s",
                         options.chrommap, strchrom)
            sys.exit()
        print(vcfr.get_record())
    return count
def main(options):
    """Replace the variant id of each VCF data line with its dbSNP id.

    The chromosome map is loaded from ``options.chrommap``.  For each data
    line on stdin the dbSNP tabix file ``options.dbsnpfile`` is queried at
    ``chrom_map[options.chrom]`` and the record's position.  When at least
    one dbSNP record comes back, the id of the first one is written into the
    record; otherwise the miss is logged.  Every line is printed, and header
    lines pass through unchanged.

    Returns:
        The number of input lines read, headers included.
    """
    try:
        # The context manager closes the map file even when loading fails.
        with open(options.chrommap) as fh:
            chrom_map = load_chrom_map(fh)
    except Exception:
        print("Unable to open %s" % (options.chrommap,))
        sys.exit()

    dbsnpfile = Dbsnpfile()
    dbsnpfile.set_tabix_file(options.dbsnpfile)

    count = 0
    for line in sys.stdin:
        count += 1
        line = line.strip()
        if line.startswith('#'):
            print(line)
            continue
        vcfr = VCFrecord(line)
        posn = vcfr.get_posn_as_int()
        try:
            dbsnprecs = dbsnpfile.get_dbsnp_file_record(
                options.dbsnpfile, chrom_map[options.chrom], posn)
        except Exception:
            # NOTE(review): any lookup failure is reported as a missing
            # chromosome, not only a missing map entry.
            print("Chromosome not found in map %s" % (options.chrom,))
            sys.exit()
        if len(dbsnprecs) > 0:
            vcfr.set_varid(dbsnpfile.get_rsid(dbsnprecs[0]))
        else:
            logging.info("NOT FOUND for %s at %d", options.chrom, posn)
        print(vcfr.get_record())
    return count
def get_geno_data(self, rsid, sample_id, assaytype_list_posns):
    """Return one sample's genotype field per assay type for a variant.

    Args:
        rsid: variant id passed to ``self.get_variant_data_multi``.
        sample_id: used only to build the result keys.
        assaytype_list_posns: maps assay type to the sample's column index
            in that assay's genotype columns.

    Returns:
        A dict keyed ``"<sample_id>_<assaytype>"`` holding the genotype
        column at the sample's index.  Assay types absent from
        ``assaytype_list_posns`` are left out.
    """
    geno_values = {}
    for doc in self.get_variant_data_multi(rsid):
        assaytype = doc["assaytype"]
        # Skip first, so no file path is resolved and no record is read for
        # an assay type whose data would be thrown away.
        if assaytype not in assaytype_list_posns:
            continue
        # Always force the chromosome to 2 digits.
        chromosome = "%.2d" % int(doc["chromosome"])
        fpath = self.filepaths_coll.get_filepath(assaytype, chromosome)
        fullrec = self.get_raw_variant_values(fpath, chromosome,
                                              doc['position'])
        _, genodata = VCFrecord(fullrec).get_prfx_sfx()
        geno_values[sample_id + "_" + assaytype] = genodata[
            assaytype_list_posns[assaytype]]
    return geno_values
def __init__(self, db, dbname, projpref="akh", get_anochi=False, probidx=1):
    """Create the collection wrappers that sit on top of ``db``.

    Builds the Gwasdb, file-data, sample, marker and prochi-map helpers in
    that order, and initialises the cached totals to their empty values.
    """
    # Plain configuration first.
    self.db = db
    self.dbname = dbname

    # Helper objects; the marker collection depends on the three before it.
    self.gwasdb = Gwasdb(db)
    self.filedata_coll = _filedata(db)
    self.sam_coll = _samples(db)
    self.mkr_coll = _markers(db, self.filedata_coll, self.sam_coll,
                             self.gwasdb, probidx)
    self.prochi_coll = _prochi_map(db, projpref, get_anochi)

    # Cached state, empty until filled elsewhere.
    self.marker_totals = []
    self.sample_count = -1
    self.call_rates = {}

    self.probidx = probidx
    self.vcfr = VCFrecord()
def main(options):
    """Print one sample's genotype field for a variant, per assay type.

    For every variant document of ``options.rsid`` the raw record is read;
    when ``options.sampleid`` has a column position for that assay type a
    CSV line ``rsid,sampleid,assaytype,position,genotype`` is printed.

    Returns:
        0 - nothing is counted.
    """
    count = 0
    godb = GoDb()

    variant_docs = godb.get_multiple_variants(options.rsid)
    sample_columns = godb.get_sample_posns(options.sampleid)

    for doc in variant_docs:
        assaytype = doc["assaytype"]
        path = godb.get_filepath(assaytype, doc["chromosome"])
        raw = godb.get_variant_file_data(path, doc["chromosome"],
                                         doc["position"])
        _, genotypes = VCFrecord(raw).get_prfx_sfx()
        if assaytype not in sample_columns:
            continue
        column = sample_columns[assaytype]
        print("%s,%s,%s,%d,%s" % (options.rsid, options.sampleid, assaytype,
                                  column, genotypes[column]))
    return count
def get_next_records(self, key_list, prfx_list, recbuff_list):
    """Advance every input file that currently holds the lowest key.

    ``self.get_low_key_list`` marks which files carry the minimum key; files
    marked with ``self.empty_key`` are left alone.  Each selected file has
    its next line read and its slot in the three lists replaced: the fixed
    columns go to ``prfx_list``, the genotype columns to ``recbuff_list``
    and the integer position (second fixed column) to ``key_list``.  At end
    of file the slot gets empty lists and ``self.high_key``.

    Returns:
        The same three lists, updated in place.
    """
    low_key_list, _low_key_count = self.get_low_key_list(key_list)
    for i, fh in enumerate(self.fh_list):
        if low_key_list[i] == self.empty_key:
            continue
        line = fh.readline().strip()
        if line == "":
            # An empty read is taken as EOF: park this file on the high key.
            prfx_list[i] = []
            recbuff_list[i] = []
            key_list[i] = self.high_key
            continue
        self.rec_counts[i] += 1
        vcfr = VCFrecord(line)
        prfx, sfx = vcfr.get_prfx_sfx()
        # The old per-record get_maf_and_cr(data, vcfr) call is gone: it
        # referenced an undefined name and its results were never used.
        prfx_list[i] = prfx
        recbuff_list[i] = sfx
        key_list[i] = int(prfx[1])
    return key_list, prfx_list, recbuff_list
def get_combined_array(self, buffer_list, cr_list, assay_list, threshold=0.9):
    """Merge the genotype columns of several assays into one combined row.

    The result has one slot per entry of ``self.combined_positions``, all
    initially ``"."``.  For every non-empty record in ``buffer_list``, each
    genotype that is not ``"."`` is called against ``threshold`` and placed
    in the combined slot of its sample (looked up through
    ``self.file_positions``).  When the record has no ``AT`` format field,
    the assay abbreviation is appended to the call.  If the slot is already
    filled, the overlap counter is bumped and ``self.call_genotype`` picks
    the value to keep.

    ``cr_list`` is accepted but not consulted.
    """
    merged = ["."] * len(self.combined_positions)
    for file_idx, raw_record in enumerate(buffer_list):
        if len(raw_record) == 0:
            continue
        record = VCFrecord(raw_record)
        _, genotypes = record.get_prfx_sfx()
        prob_index = record.get_probidx()
        rsid = record.get_varid()  # only needed by overlap debug traces
        has_at = record.has_fmt("AT")
        for column, cell in enumerate(genotypes):
            if cell == ".":
                continue
            sample = self.file_positions[file_idx][column]
            slot = self.combined_positions[sample]
            call = self.call_geno_for_threshold(cell, prob_index, threshold)
            if has_at == False:
                call = call + ":" + self.assay_abbrev[assay_list[file_idx]]
            if merged[slot] != ".":
                # Another assay already supplied this sample.
                self.geno_overlap_count += 1
                call = self.call_genotype(merged[slot], call, prob_index)
            merged[slot] = call
    return merged
def main():
    """Print a CSV summary line for every VCF data record on stdin.

    After the fixed header line, each record yields its id, chromosome,
    position, alleles, minor allele, MAF, call rate and HWE p-value.  The
    call rate is ``calls / (calls + virtual no-calls)``; missing genotypes
    are totalled but do not enter the call rate.

    A record whose statistics raise ZeroDivisionError (for example one with
    no calls at all) is logged and left out of the output.  It is not
    printed with values carried over from the previous record.

    Returns:
        ``(in_count, hdr_count, homr_total, het_total, homa_total,
        virt_nc_total, miss_total)``.
    """
    mafh = Mafhelper()
    hweh = Hwehelper()
    in_count = 0
    hdr_count = 0
    homr_total = 0
    het_total = 0
    homa_total = 0
    virt_nc_total = 0
    miss_total = 0
    print("SNPId,AssayType,chr,pos,REF,ALT,Minor,MAF,CallRate,HWE_pval")
    for line in sys.stdin:
        line = line.strip()
        in_count += 1
        if line.startswith("#"):
            hdr_count += 1
            continue
        vcfr = VCFrecord(line)
        varid = vcfr.get_varid_ukb()
        chromosome = vcfr.get_chr()
        posn = vcfr.get_posn_as_int()
        ref, alt = vcfr.get_alleles()
        (homref_count, het_count, homalt_count, virt_nc_count,
         miss_count) = vcfr.get_allele_counts()
        call_count = homref_count + het_count + homalt_count
        nocall_count = virt_nc_count

        homr_total += homref_count
        het_total += het_count
        homa_total += homalt_count
        virt_nc_total += virt_nc_count
        miss_total += miss_count

        try:
            # The call-rate division is guarded too: it is the first place a
            # record without any calls divides by zero.
            call_rate = float(call_count) / float(call_count + nocall_count)
            hwe = hweh.HWE_exact(het_count, homref_count, homalt_count,
                                 call_count)
            maf, ma = mafh.maf(het_count, homref_count, ref, homalt_count,
                               alt, virt_nc_count)
        except ZeroDivisionError:
            logging.info(
                "DIV 0 error at %d (%d), where hom_r=%d, het=%d, home_a=%d, cc=%d",
                in_count, posn, homref_count, het_count, homalt_count,
                call_count)
            continue
        print("%s,combo,%s,%d,%s,%s,%s,%s,%.3f,%s" %
              (varid, chromosome, posn, ref, alt, ma, maf, call_rate, hwe))
    return (in_count, hdr_count, homr_total, het_total, homa_total,
            virt_nc_total, miss_total)
def main(options):
    """Merge one variant's records across assay types into combined VCF rows.

    The variant ids come from ``options.snpfile`` (via ``load_snpfile_data``)
    or from the comma-separated ``options.rsids``.  For each id the matching
    variant documents are fetched, the raw record of every included assay
    type is read, and the per-assay genotype columns are merged into one
    combined sample layout by ``Multibuffermerge``.  A header line and one
    line per merged variant are printed to stdout.

    When ``options.check == 'Y'`` a variant is only written if
    ``check_concordancies`` accepts it.

    Returns:
        The number of combined records printed.
    """
    # Assay types taken into the merge.  Earlier revisions switched between
    # wider sets (affy, illumina, broad, metabo, exome, bigtest, biggertest)
    # by editing this line.
    included_assaytypes = {"broad":1}
    rsids = []
    godb = GoDb()
    try:
        if options.snpfile != None:
            # NOTE(review): fh is never closed in this function.
            fh = open(options.snpfile, "r")
            rsids = load_snpfile_data(fh)
        else:
            rsids = options.rsids.split(",")
    except IOError as e:
        print "I/O error({0}): {1}".format(e.errno, e.strerror)
        exit()
    except TypeError as e:
        print "Missing arguments ", e
        exit()
    except:
        logging.info("Unexpected error: %s", str(sys.exc_info()))
        sys.exit()
    # Step 0 - initialise db connection and instantiate helper objects
    mafh = Mafhelper()
    hweh = Hwehelper()
    # Data structures
    atype_list = []        # assay types in first-seen order; fixes slot order
    atype_posns = {}       # assay type -> index into atype_list
    marker_list = []
    rsid_assaytypes = {}   # rsid -> list of included variant documents
    rsid_dict = {}         # rsid -> raw record per assay slot
    rsid_prfx_dict = {}
    rsid_cr_dict = {}      # rsid -> call rate per assay slot
    rsid_info_dict = {}
    hdr_pref = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"]
    # Step 1 - get the list of entries for each rsid - one per assaytype
    for rsid in rsids:
        docs = godb.get_multiple_variants(rsid)
        if docs.count() > 0:
            rsid_assaytypes[rsid] = []
        else:
            logging.info("RSID %s NOTFOUND", rsid)
        # Step 1a - collect assaytypes and marker documents
        # At this point we're establishing a list order which must be observed throughout.
        for doc in docs:
            if doc["assaytype"] not in included_assaytypes:
                continue
            if doc["assaytype"] not in atype_list:
                atype_list.append(doc["assaytype"])
            rsid_assaytypes[rsid].append(doc)
    logging.info(str(atype_list))
    # Step 2 - collect lists of prochis (sample ids) by assaytype
    # The [[]] * n slots all alias one list, which is safe only because every
    # slot is replaced by assignment below and never mutated in place.
    prochi_list = [[]] * len(atype_list)
    for i, atype in enumerate(atype_list):
        atype_posns[atype] = i
        prochi_list[i] = godb.get_samples(atype)
    mm = Multibuffermerge(prochi_list)
    # Step 3 - get combined col_header positions
    # combo is a dict {posn:colname}
    combo = mm.get_combined_positions()
    # combocol is a list [colname1, colname2, ..., colname] again we keep the order of this intact
    combocol = mm.get_combined_columns()
    # Step 4 - for each variant by rsid
    for rsid in rsid_assaytypes:
        if rsid not in rsid_dict:
            # Same aliasing caveat as prochi_list: slots are only reassigned.
            rsid_prfx_dict[rsid] = [[]] * len(atype_list)
            rsid_dict[rsid] = [[]] * len(atype_list)
            rsid_cr_dict[rsid] = [[]] * len(atype_list)
            rsid_info_dict[rsid] = [[]] * len(atype_list)
        for doc in rsid_assaytypes[rsid]:
            if options.prfx != None:
                fpath = godb.get_full_filepath(doc["assaytype"], doc["chromosome"], options.prfx)
            else:
                fpath = godb.get_filepath(doc["assaytype"], doc["chromosome"])
            logging.info("Assaytype=%s fpath=%s", doc["assaytype"], fpath)
            result = godb.get_variant_file_data(fpath, doc["chromosome"], doc["position"])
            if result != None:
                vcfr = VCFrecord(result)
                varid = vcfr.get_varid()
                # Only a record whose own id equals the requested rsid is kept.
                if varid == rsid:
                    rec = result
                    maf, ma, cr = mafh.get_maf_and_cr(vcfr)
                    # TODO - ALSO check maf, also apply QC filter at individual record level
                    rsid_cr_dict[doc["rsid"]][atype_posns[doc["assaytype"]]] = cr
                    rsid_dict[doc["rsid"]][atype_posns[doc["assaytype"]]] = rec
                    logging.info("%s (%s), maf=%s, ma=%s, cr=%s" % (doc["rsid"], doc["assaytype"], maf, ma, cr))
    # Step 5 - execute the merge process
    print "\t".join(hdr_pref + combocol)
    count = 0
    concordant = True
    for rsid in rsid_dict:
        # NOTE(review): only the first assay slot is tested, so a variant
        # with data solely in a later slot appears to be skipped - confirm.
        if len(rsid_dict[rsid][0]) > 0:
            if options.check == 'Y':
                concordant = mm.check_concordancies(rsid_dict[rsid], atype_list, options.chipval)
            if concordant == True:
                comborec = mm.get_combined_array(rsid_dict[rsid], rsid_cr_dict[rsid], atype_list)
                vcfr = VCFrecord(rsid_dict[rsid][0])
                prfx,sfx = vcfr.get_prfx_sfx()
                if len(prfx) > 0:
                    logging.info("PRFX = %s, for %s", str(prfx), rsid)
                    # Column 8 is FORMAT in hdr_pref; tag it with ":AT".
                    prfx[8] += ":AT"
                    outrec = prfx + comborec
                    print "\t".join(outrec)
                    count += 1
                else:
                    logging.info("RSID %s NOTFOUND (2)", rsid)
                    pass
            else:
                logging.info("Concordancy check fail for - %s" % (rsid))
    chi_test_count, allele_disc_count, overlap_count = mm.get_counts()
    logging.info("CHI test count = %d, Allele discord count = %d, Overlap check count = %d", chi_test_count, allele_disc_count, overlap_count)
    return count
def get_rslist_data(self, input_rslist, threshold, download_list):
    """Build the SNP summary CSV and the per-sample data for a list of ids.

    For every variant document of every id, the raw record is read and
    genotype statistics are computed at ``threshold``.  One CSV line per
    document is added after the fixed header.  A document counts as imputed
    when it has an ``imputed`` key or its ``info`` differs from 1.0.

    ``download_list`` is accepted but not consulted.

    Returns:
        ``(pdata, snpdata, msg)``: the result of ``self.get_sample_values``,
        the CSV text, and ``None``.
    """
    row_format = "%s,%s,%s,%d,%s,%s,%s,%s,%s,%d,%.5f,%.5f,%.5f\n"
    csv_parts = [
        "SNPId,AssayType,chr,pos,REF,REF_fr,ALT,ALT_fr,MAF,Imputed,CallRate,HWE_pval,Info\n"
    ]
    records = []
    platforms = []
    found_rsids = []
    platforms_seen = {}
    a_freqs = {}
    b_freqs = {}
    imputed_flags = {}
    record_count = 0

    for rsid in input_rslist:
        docs = self.var_coll.get_variant_data_multi(rsid)
        if len(docs) > 0:
            found_rsids.append(rsid)
        # A SNP may be present on several platforms: one document each.
        for doc in docs:
            # Chromosome is always rendered as two digits.
            chrom = "%.2d" % int(doc["chromosome"])
            path = self.filepaths_coll.get_filepath(doc["assaytype"], chrom)
            raw = self.var_coll.get_raw_variant_values(path, chrom,
                                                       doc['position'])
            record = VCFrecord(raw)
            _, genotypes = record.get_prfx_sfx()
            prob_index = record.get_probidx()
            (_call_counts, sample_count, geno_count, maf, a_freq, b_freq,
             p_hwe) = self.var_coll.get_genotype_probs(genotypes, threshold,
                                                       prob_index)
            key = doc["rsid"] + "_" + doc["assaytype"]
            a_freqs[key] = a_freq
            b_freqs[key] = b_freq
            record_count += 1
            hwe_p = float(p_hwe)
            platforms.append(doc["assaytype"])
            records.append(record)
            platforms_seen.setdefault(doc["assaytype"], 1)

            if "imputed" in doc or ("info" in doc and doc["info"] != 1.0):
                imputed = 1
            else:
                imputed = 0
            imputed_flags[key] = imputed

            csv_parts.append(row_format % (
                doc["rsid"], doc["assaytype"], doc["chromosome"],
                doc["position"], doc["alleleA"], a_freq, doc["alleleB"],
                b_freq, maf, imputed, float(geno_count) / sample_count,
                hwe_p, doc["info"]))

    snpdata = "".join(csv_parts)
    pdata = self.get_sample_values(platforms, records, record_count,
                                   found_rsids, imputed_flags,
                                   platforms_seen, a_freqs, b_freqs,
                                   threshold)
    return (pdata, snpdata, None)
def main(options):
    """Pivot a VCF on stdin into a sample-by-variant CSV of integer calls.

    The ``#`` column-header line defines the samples.  Every data line adds
    one column (headed by its variant id) holding, per sample, the integer
    call looked up in the module-level ``icalls`` by the first genotype
    sub-field, or an empty string when the genotype is ``"."``.  The header
    row and one row per sample are printed at the end.

    Returns:
        The number of sample rows printed.
    """
    hdrData = ["id"]
    sampleDict = {}
    colPosns = {}
    RefAlleleDict = {}
    AltAlleleDict = {}
    count = 0
    mafh = Mafhelper()
    # Allele flipping is switched off: nothing sets this flag since the
    # "minor allele == ref" test was retired.  The swap below is kept
    # working in case it is re-enabled.
    flip = False
    for line in sys.stdin:
        line = line.strip()
        if line.startswith('##'):
            continue
        vcfr = VCFrecord(line)
        prfx, sfx = vcfr.get_prfx_sfx()
        if line.startswith('#'):
            # Parse out the header record.
            for i, col_hdr in enumerate(sfx):
                colPosns[i] = col_hdr
                sampleDict[col_hdr] = []
            continue

        varid = vcfr.get_varid_ukb()
        ref, alt = vcfr.get_alleles()
        probidx = vcfr.get_probidx()
        (homref_count, het_count, homalt_count, nc_count,
         miss_count) = vcfr.get_allele_counts()
        maf, ma = mafh.maf(het_count, homref_count, ref, homalt_count, alt,
                           nc_count)
        RefAlleleDict[varid] = ref
        AltAlleleDict[varid] = alt
        hdrData.append(varid)
        for i, str_geno in enumerate(sfx):
            if str_geno == ".":
                sampleDict[colPosns[i]].append("")
                continue
            geno = str_geno.split(":")
            max_prob, max_idx = get_max_prob(geno, probidx)
            i_call = icalls[geno[0]]
            if flip:
                # NOTE(review): compares against strings; confirm the value
                # type stored in icalls before re-enabling the flip.
                if i_call == "0":
                    # Was ``i_call == "2"``, a comparison with no effect.
                    i_call = "2"
                elif i_call == "2":
                    i_call = "0"
            sampleDict[colPosns[i]].append(str(i_call))

    print(",".join(hdrData))
    for samp in sampleDict:
        count += 1
        print(",".join([samp] + sampleDict[samp]))
    return count
def main(options):
    """Merge two position-sorted VCF streams into one combined output.

    File 1 is ``options.file1`` and file 2 is stdin.  After the headers are
    loaded and the combined sample positions assigned, the two streams are
    walked in step by position key:

    * a position present in only one file is written from that file alone;
    * a position present in both is written as a combined record when the
      REF/ALT alleles agree, and counted as discordant (not written)
      otherwise.

    Both inputs are assumed to hold at least one data record after the
    header.  The module-level ``max_key`` marks an exhausted stream.

    Returns:
        ``(f1_count, f2_count, out_count, discord_count)``.
    """
    try:
        fh1 = open(options.file1, "r")
        fh2 = sys.stdin
    except IOError as e:
        logging.info("I/O error({0}): {1}".format(e.errno, e.strerror))
        sys.exit()
    except TypeError as e:
        # %-style arguments: concatenating str and exception raised TypeError.
        logging.info("Missing arguments %s", e)
        sys.exit()
    except Exception:
        logging.info("Unexpected error:%s", sys.exc_info()[0])
        sys.exit()

    vcff = VCFrecord()

    def read_record(fh):
        """Return (data, key) for the next line of fh, or None at EOF."""
        line = fh.readline().strip()
        if line == "":
            return None
        data = vcff.get_data_array(line)
        return data, vcff.get_posn_from_array_as_int(data)

    try:
        f1_hdr, f2_hdr = load_sample_positions(fh1, fh2, vcff)
        for i, sample in enumerate(sorted(combined_samples)):
            combined_positions[sample] = i
        output_combined_hdr(f1_hdr, f2_hdr, vcff)

        line1 = fh1.readline().strip()
        data1 = vcff.get_data_array(line1)
        key1 = vcff.get_posn_from_array_as_int(data1)
        fmts1 = vcff.get_fmts_from_array(data1)
        idxs1 = vcff.get_fmt_indices(fmts1, ["GT", "GP"])

        line2 = fh2.readline().strip()
        data2 = vcff.get_data_array(line2)
        key2 = vcff.get_posn_from_array_as_int(data2)
        # File 2's own record supplies its format indices (this used to
        # read data1 by mistake).
        fmts2 = vcff.get_fmts_from_array(data2)
        idxs2 = vcff.get_fmt_indices(fmts2, ["GT", "GP"])

        f1_count = 1
        f2_count = 1
        out_count = 0
        discord_count = 0

        while not (key1 == max_key and key2 == max_key):
            advance1 = advance2 = False
            if key1 > key2:
                # Position only in file 2.
                # NOTE(review): now passes file 2's indices; identical to
                # the old behaviour whenever both files share a FORMAT.
                output_combined_record([], data2, vcff, idxs2)
                out_count += 1
                advance2 = True
            elif key2 > key1:
                # Position only in file 1.
                output_combined_record(data1, [], vcff, idxs1)
                out_count += 1
                advance1 = True
            else:
                # On equality - check for allele concordance
                AlleleA1, AlleleB1 = vcff.get_alleles_from_array(data1)
                AlleleA2, AlleleB2 = vcff.get_alleles_from_array(data2)
                # TODO HWE concordance check - but how do we set thresholds?
                if (AlleleA1 == AlleleA2) and (AlleleB1 == AlleleB2):
                    output_combined_record(
                        data1, data2, vcff, idxs1,
                        vcff.get_call_rate_from_array(data1),
                        vcff.get_call_rate_from_array(data2))
                    out_count += 1
                else:
                    discord_count += 1
                advance1 = advance2 = True

            if advance1:
                nxt = read_record(fh1)
                if nxt is None:
                    key1 = max_key
                else:
                    f1_count += 1
                    data1, key1 = nxt
            if advance2:
                nxt = read_record(fh2)
                if nxt is None:
                    key2 = max_key
                else:
                    f2_count += 1
                    data2, key2 = nxt
    finally:
        # Close the inputs on failure too, not only on success.
        fh1.close()
        fh2.close()
    return f1_count, f2_count, out_count, discord_count
def main(options):
    """Merge two position-sorted VCF streams into one combined output.

    File 1 is ``options.file1`` and file 2 is stdin.  After the headers are
    loaded and the combined sample positions assigned, the two streams are
    walked in step by position key:

    * a position present in only one file is written from that file alone;
    * a position present in both is written as a combined record when the
      REF/ALT alleles agree, and counted as discordant (not written)
      otherwise.

    Both inputs are assumed to hold at least one data record after the
    header.  The module-level ``max_key`` marks an exhausted stream.

    Returns:
        ``(f1_count, f2_count, out_count, discord_count)``.
    """
    try:
        fh1 = open(options.file1, "r")
        fh2 = sys.stdin
    except IOError as e:
        logging.info("I/O error({0}): {1}".format(e.errno, e.strerror))
        sys.exit()
    except TypeError as e:
        # %-style arguments: concatenating str and exception raised TypeError.
        logging.info("Missing arguments %s", e)
        sys.exit()
    except Exception:
        logging.info("Unexpected error:%s", sys.exc_info()[0])
        sys.exit()

    vcff = VCFrecord()

    def read_record(fh):
        """Return (data, key) for the next line of fh, or None at EOF."""
        line = fh.readline().strip()
        if line == "":
            return None
        data = vcff.get_data_array(line)
        return data, vcff.get_posn_from_array_as_int(data)

    try:
        f1_hdr, f2_hdr = load_sample_positions(fh1, fh2, vcff)
        for i, sample in enumerate(sorted(combined_samples)):
            combined_positions[sample] = i
        output_combined_hdr(f1_hdr, f2_hdr, vcff)

        line1 = fh1.readline().strip()
        data1 = vcff.get_data_array(line1)
        key1 = vcff.get_posn_from_array_as_int(data1)
        fmts1 = vcff.get_fmts_from_array(data1)
        idxs1 = vcff.get_fmt_indices(fmts1, ["GT", "GP"])

        line2 = fh2.readline().strip()
        data2 = vcff.get_data_array(line2)
        key2 = vcff.get_posn_from_array_as_int(data2)
        # File 2's own record supplies its format indices (this used to
        # read data1 by mistake).
        fmts2 = vcff.get_fmts_from_array(data2)
        idxs2 = vcff.get_fmt_indices(fmts2, ["GT", "GP"])

        f1_count = 1
        f2_count = 1
        out_count = 0
        discord_count = 0

        while not (key1 == max_key and key2 == max_key):
            advance1 = advance2 = False
            if key1 > key2:
                # Position only in file 2.
                # NOTE(review): now passes file 2's indices; identical to
                # the old behaviour whenever both files share a FORMAT.
                output_combined_record([], data2, vcff, idxs2)
                out_count += 1
                advance2 = True
            elif key2 > key1:
                # Position only in file 1.
                output_combined_record(data1, [], vcff, idxs1)
                out_count += 1
                advance1 = True
            else:
                # On equality - check for allele concordance
                AlleleA1, AlleleB1 = vcff.get_alleles_from_array(data1)
                AlleleA2, AlleleB2 = vcff.get_alleles_from_array(data2)
                # TODO HWE concordance check - but how do we set thresholds?
                if (AlleleA1 == AlleleA2) and (AlleleB1 == AlleleB2):
                    output_combined_record(
                        data1, data2, vcff, idxs1,
                        vcff.get_call_rate_from_array(data1),
                        vcff.get_call_rate_from_array(data2))
                    out_count += 1
                else:
                    discord_count += 1
                advance1 = advance2 = True

            if advance1:
                nxt = read_record(fh1)
                if nxt is None:
                    key1 = max_key
                else:
                    f1_count += 1
                    data1, key1 = nxt
            if advance2:
                nxt = read_record(fh2)
                if nxt is None:
                    key2 = max_key
                else:
                    f2_count += 1
                    data2, key2 = nxt
    finally:
        # Close the inputs on failure too, not only on success.
        fh1.close()
        fh2.close()
    return f1_count, f2_count, out_count, discord_count
class DataStore():
    """
    Query facade over the GWAS database collections and their VCF files.

    Wraps the filedata, samples, markers and prochi-map collection helpers
    and assembles their results into marker summaries and per-sample
    CSV / GEN style text output.
    """

    # Maximum number of lines accepted from a marker-list file.
    MAX_RSLIST_LINES = 500

    def __init__(self, db, dbname, projpref="akh", get_anochi=False,
                 probidx=1):
        """
        db         - database handle passed on to every collection helper
        dbname     - name reported by get_db_name()
        projpref   - project prefix handed to the prochi map helper
        get_anochi - flag handed to the prochi map helper
        probidx    - index of the probabilities field inside a ':' separated
                     per-sample genotype value
        """
        self.db = db
        self.dbname = dbname
        self.gwasdb = Gwasdb(db)
        self.filedata_coll = _filedata(db)
        self.sam_coll = _samples(db)
        self.mkr_coll = _markers(db, self.filedata_coll, self.sam_coll,
                                 self.gwasdb, probidx)
        self.prochi_coll = _prochi_map(db, projpref, get_anochi)
        self.marker_totals = []
        # Most recently fetched sample count (-1 until the first lookup).
        self.sample_count = -1
        # Per-assaytype cache behind get_sample_count().
        self._sample_counts = {}
        self.call_rates = {}
        self.probidx = probidx
        self.vcfr = VCFrecord()

    def get_db_name(self):
        """ Return the database name supplied at construction """
        return self.dbname

    def get_probidx(self):
        """ Return the index of the probabilities field in a sample value """
        return self.probidx

    def make_selection_key(self, varid, assaytype):
        """ Build the composite '<varid>_<assaytype>' key """
        return varid + "_" + assaytype

    def get_rsid_prochi_data(self, rsid, prochi, threshold, filefmt):
        """
        Get data for the prochi, dict {platform:position_in_list}
        Get marker data for the rsid (up to num of platforms)

        The genotypes are printed, one per line; nothing is returned.
        threshold and filefmt are accepted but not used.
        """
        assaytype_list_posns = {}
        for sdoc in self.sam_coll.get_sampledata(prochi):
            assaytype_list_posns[sdoc["assaytype"]] = sdoc["list_posn"]
        genotypes = self.mkr_coll.get_geno_data(rsid, prochi,
                                                assaytype_list_posns)
        for genotype in genotypes:
            # Single-argument form prints identically on Python 2 and 3.
            print("%s %s" % (genotype, genotypes[genotype]))

    def _read_rsid_file(self, filepath):
        """
        Read the first whitespace-delimited token of every non-blank line.

        Returns (rslist, msg). msg is None on success; on failure rslist
        is [] and msg describes the problem (file could not be opened, or
        it has more than MAX_RSLIST_LINES lines).
        """
        try:
            f = open(filepath, "r")
        except IOError as e:
            return ([], filepath + ":" + e.strerror)
        rslist = []
        # 'with' closes the file on every exit path, including the
        # early return taken when the line limit is exceeded.
        with f:
            for count, line in enumerate(f, 1):
                if count > self.MAX_RSLIST_LINES:
                    msg = "line count for %s gt the limit (%d)" % (
                        filepath, self.MAX_RSLIST_LINES)
                    return ([], msg)
                elems = line.split()
                if elems:  # blank lines carry no marker id
                    rslist.append(elems[0])
        return (rslist, None)

    def get_marker_data_for_file(self, filepath, threshold):
        """
        Summarise every marker listed in filepath.

        Returns (marker_data, msg) where msg concatenates the messages
        from get_marker_summary_probs (empty string when there are none).
        """
        rslist, msg = self._read_rsid_file(filepath)
        if msg is not None:
            # NOTE(review): the error path has always returned a 3-tuple
            # while the success path returns a 2-tuple; shape kept as-is
            # because callers are not visible here - confirm and unify.
            return ([], [], msg)
        marker_data = []
        msgs = []
        for rsid in rslist:
            (marker_docs, tmsg) = self.get_marker_summary_probs(rsid,
                                                                threshold)
            if tmsg != "":
                msgs.append(tmsg)
            marker_data.extend(marker_docs)
        return marker_data, "".join(msgs)

    def get_marker_data_by_range(self, chr, start, end, threshold=0.9):
        """
        Return (docs, msg) for markers on chr between start and end.
        threshold is accepted but not used.
        """
        return (self.mkr_coll.get_marker_data_by_range(chr, start, end))

    def get_range_data(self, chr, start, end, threshold, download_list):
        """
        Build the download data for every distinct rsid found in a range.

        Returns ([], [], msg) when the range yields nothing, otherwise
        whatever get_rslist_data returns.
        """
        (docs, msg) = self.mkr_coll.get_marker_data_by_range(chr, start, end)
        if len(docs) == 0:
            return ([], [], msg)
        # De-duplicate (a marker can be on several platforms) while
        # keeping the order in which the range query returned them.
        seen = set()
        rslist = []
        for doc in docs:
            rsid = doc["rsid"]
            if rsid not in seen:
                seen.add(rsid)
                rslist.append(rsid)
        return (self.get_rslist_data(rslist, threshold, download_list))

    def _fill_geno_cells(self, row, offset, geno_data, threshold):
        """ Write the four CSV cells for one genotype entry into row """
        dataVals = geno_data[2].split(':')
        probVals = dataVals[self.get_probidx()].split(',')
        (probcall, intcall, outprob) = self.mkr_coll.get_call(probVals,
                                                              threshold)
        row[offset] = str(intcall)
        row[offset + 1] = str(outprob)
        row[offset + 2] = geno_data[0]
        row[offset + 3] = geno_data[4]

    def build_csv_data(self, rslist, sampleDict, assaytypes, threshold):
        """
        Build CSV lines per platform plus a 'combined' pseudo-platform.

        rslist     - marker ids; four columns are emitted per marker
        sampleDict - {sample: {rsid: [genotype entry, ...]}} where an entry
                     is indexed as [0]=platform, [2]=raw sample value,
                     [4]=second allele
        assaytypes - iterable of platform names (not modified)
        threshold  - passed on to the call resolution

        Returns {platform: [header line, sample line, ...]}.
        """
        # NOTE: maintaining rslist order is vital!
        rsidx = {}
        for idx, rsid in enumerate(rslist):
            rsidx[rsid] = idx

        # Work on a private list so the caller's container is untouched.
        platforms = list(assaytypes)
        if 'combined' not in platforms:
            platforms.append('combined')

        hdrData = ["sampleId"]
        for rsid in rslist:
            hdrData.extend([rsid, rsid + "_c", rsid + "_p", rsid + "_alt"])
        hdrString = ','.join(hdrData)
        by_platform_data = {}
        for platform in platforms:
            by_platform_data[platform] = [hdrString]

        ncols = len(rslist) * 4
        for samp in sampleDict:
            output_lines = {}
            for platform in platforms:
                output_lines[platform] = [""] * ncols
            # 'combined' always gets a line, even if every cell is empty.
            filled = set(['combined'])
            for rsid in sampleDict[samp]:
                genlist = sampleDict[samp][rsid]
                # if a sample wasn't genotyped on any platform there
                # might not be data
                if len(genlist) == 0:
                    continue
                idxoffset = rsidx[rsid] * 4
                best = self.resolve_geno(genlist, rsid, samp, threshold)
                # resolve_geno returns [] when no platform has a non-zero
                # probability; leave the combined cells blank in that case.
                if best:
                    self._fill_geno_cells(output_lines['combined'],
                                          idxoffset, best, threshold)
                for geno_data in genlist:
                    self._fill_geno_cells(output_lines[geno_data[0]],
                                          idxoffset, geno_data, threshold)
                    filled.add(geno_data[0])
            for platform in platforms:
                if platform in filled:
                    by_platform_data[platform].append(
                        samp + "," + ",".join(output_lines[platform]))
        return by_platform_data

    def build_gen_data(self, rslist, sampleDict, threshold):
        """
        Build space separated lines: a header naming every sample three
        times, then one line per marker with each sample's probability
        values ("0 0 0" when the sample has no resolvable genotype).
        """
        samples = sorted(sampleDict)
        hdrData = ["rsid"]
        for sampleid in samples:
            hdrData.extend([sampleid] * 3)
        data = [' '.join(hdrData)]
        for rsid in rslist:
            fields = [rsid]
            for samp in samples:
                geno_data = self.resolve_geno(sampleDict[samp][rsid], rsid,
                                              samp, threshold)
                if len(geno_data) > 2:
                    dataVals = geno_data[2].split(':')
                    probVals = dataVals[self.get_probidx()].split(',')
                    fields.append(' '.join(probVals))
                else:
                    fields.append("0 0 0")
            data.append(' '.join(fields))
        return data

    def get_integer_call(self, intcall, afreq, bfreq, normalise=False):
        """
        Optionally flip a 0/2 call when afreq is lower than bfreq.

        Returns (call, normalised); normalised is True only when
        normalise is set and afreq < bfreq.
        """
        if normalise == True and afreq < bfreq:
            if intcall == 2:
                return 0, True
            if intcall == 0:
                return 2, True
            return intcall, True
        return intcall, False

    def resolve_geno(self, genlist, rsid, samp, threshold):
        """
        Pick one entry from a sample's per-platform genotype entries.

        A single entry is returned as-is; with several, the entry whose
        call has the highest probability wins. Returns [] when genlist is
        empty or no entry has a probability above zero.
        rsid and samp are accepted but not used.
        """
        if len(genlist) == 1:
            return genlist[0]
        maxprob = 0.0
        maxidx = -1
        for idx, gendata in enumerate(genlist):
            dataVals = gendata[2].split(':')
            probVals = dataVals[self.get_probidx()].split(',')
            (probcall, intcall, outprob) = self.mkr_coll.get_call(probVals,
                                                                  threshold)
            if outprob > maxprob:
                maxprob = outprob
                maxidx = idx
        if maxprob > 0.0:
            return genlist[maxidx]
        return []

    def get_marker_summary_probs(self, rsid, threshold):
        """
        Return (marker_docs, msg) for rsid, one doc per platform, each
        annotated with 'selected', 'a_af', 'b_af', 'hwe_p' and 'Missing'.
        msg is "" unless nothing was found.
        """
        marker_array = []
        for doc in self.mkr_coll.get_marker_data_multi(rsid):
            fpath = self.filedata_coll.get_filepath(doc["assaytype"],
                                                    doc['chromosome'])
            # this is not ideal - need to get on top of this chromosome
            # id thing
            chrom = "%.2d" % int(doc["chromosome"])
            rec, fullrec = self.mkr_coll.get_raw_marker_values(
                fpath, doc["rsid"], chrom, doc['position'])
            prfx, sfx = self.vcfr.get_prfx_sfx_from_array(rec)
            (gc_count_dict, sample_count, geno_count, maf_value, alleleAf,
             alleleBf, p_hwe) = self.mkr_coll.get_genotype_probs(sfx,
                                                                 threshold)
            doc['selected'] = 1
            doc['a_af'] = alleleAf
            doc['b_af'] = alleleBf
            doc['hwe_p'] = p_hwe
            doc['Missing'] = gc_count_dict.get('Missing', 0)
            marker_array.append(doc)
        msg = ""
        if len(marker_array) == 0:
            msg = "Variant NOT FOUND - %s, " % (rsid,)
        return (marker_array, msg)

    def get_marker_data(self, markerid, assaytype):
        """ Return the marker doc for a marker / platform combination """
        return self.mkr_coll.get_marker_data(markerid, assaytype)

    def get_sample(self, sampleid):
        """ Return the sample record for sampleid """
        return self.sam_coll.get_sample(sampleid)

    def get_marker_totals(self):
        """ Return the marker totals, fetched once and then cached """
        if len(self.marker_totals) == 0:
            self.marker_totals = self.mkr_coll.get_marker_totals()
        return self.marker_totals

    def get_sample_count(self, assaytype):
        """
        Return the sample count for assaytype.

        Counts are cached per assaytype, so asking about a second
        platform no longer returns the first platform's count.
        """
        if assaytype not in self._sample_counts:
            self._sample_counts[assaytype] = self.sam_coll.get_count(
                assaytype)
        self.sample_count = self._sample_counts[assaytype]
        return self.sample_count

    def get_all_samples(self):
        """ Return every sample known to the samples collection """
        return self.sam_coll.get_all_samples()

    def convert_to_prochi(self, cvt_value):
        """
        Convert the supplied value to prochi by whichever method works
        (or return supplied value if all else fails)
        """
        rtn_value = self.get_prochi_from_mprochi(cvt_value)
        if rtn_value is None:
            rtn_value = self.get_prochi_from_plateid(cvt_value)
        if rtn_value is None:
            rtn_value = cvt_value
        return rtn_value

    def get_prochi_from_mprochi(self, mprochi):
        """
        Get the prochi_maps value for the supplied arg
        """
        return self.prochi_coll.get_anochi_or_prochi_from_mprochi(mprochi)

    def get_prochi_from_plateid(self, plateid):
        """
        Get the prochi_maps value for the supplied arg
        """
        return self.prochi_coll.get_prochi_from_plateid(plateid)

    def get_converted_samples(self):
        """ Return every sample id converted with convert_to_prochi """
        return [
            self.convert_to_prochi(samp)
            for samp in self.sam_coll.get_all_samples()
        ]

    def make_zipfile(self, sample_return_data, snp_return_data, uploadDir,
                     zipfilename):
        """
        moved here from views.py

        Write 'snp_summary.csv' plus one '<assaytype>_samples.csv' per
        entry of sample_return_data into uploadDir/zipfilename and return
        the raw content of the archive.
        """
        zipname = uploadDir + "/" + zipfilename
        with ZipFile(zipname, 'w') as resZip:
            resZip.writestr('snp_summary.csv', snp_return_data, ZIP_DEFLATED)
            for assaytype in sample_return_data:
                resZip.writestr(assaytype + '_samples.csv',
                                '\n'.join(sample_return_data[assaytype]),
                                ZIP_DEFLATED)
        # A zip archive is binary data: text mode can corrupt or fail to
        # decode it.
        with open(zipname, 'rb') as f:
            return f.read()

    def get_rslist_file_data(self, filepath, threshold, download_list):
        """
        Build the download data for the markers listed in filepath.

        Returns ([], [], msg) when the file cannot be read or is too
        long, otherwise whatever get_rslist_data returns.
        """
        rslist, msg = self._read_rsid_file(filepath)
        if msg is not None:
            return ([], [], msg)
        logging.info("get_rslist_file_data: %.2f", float(threshold))
        return (self.get_rslist_data(rslist, threshold, download_list))

    def get_rslist_data(self, input_rslist, threshold, download_list):
        """
        Collect summary and per-sample data for a list of marker ids.

        Returns (pdata, snpdata, msg): pdata is the per-platform CSV data
        from get_sample_values, snpdata is CSV text with one row per
        marker / platform, and msg is always None.
        download_list is accepted but not used (the selection filter is
        disabled).
        """
        msg = None
        snp_rows = [
            "SNPId,AssayType,chr,pos,REF,REF_fr,ALT,ALT_fr,MAF,Imputed,"
            "CallRate,HWE_pval,Info\n"
        ]
        data = []
        rslist = []
        assaytypes = {}
        Afreq = {}
        Bfreq = {}
        impDict = {}
        for rsid in input_rslist:
            docs = self.mkr_coll.get_marker_data_multi(rsid)
            if len(docs) > 0:
                rslist.append(rsid)
            # handling SNPs on multiple platforms
            for doc in docs:
                fpath = self.filedata_coll.get_filepath(doc["assaytype"],
                                                        doc["chromosome"])
                # this is not ideal - need to get on top of this
                # chromosome id thing
                chrom = "%.2d" % int(doc["chromosome"])
                rec, fullrec = self.mkr_coll.get_raw_marker_values(
                    fpath, doc["rsid"], chrom, doc['position'])
                if not rec:
                    # Nothing matched in the VCF: an empty record cannot
                    # be split into columns by get_sample_values.
                    logging.warning("No VCF record found for %s (%s) in %s",
                                    doc["rsid"], doc["assaytype"], fpath)
                    continue
                prfx, sfx = self.vcfr.get_prfx_sfx_from_array(rec)
                (gc_count_dict, sample_count, geno_count, maf_value,
                 alleleAf, alleleBf,
                 p_hwe) = self.mkr_coll.get_genotype_probs(sfx, threshold)
                key = self.make_selection_key(doc["rsid"], doc["assaytype"])
                Afreq[key] = alleleAf
                Bfreq[key] = alleleBf
                # add record to list of all records for the rslist
                data.append(doc["assaytype"] + '\t' + fullrec)
                if doc["assaytype"] not in assaytypes:
                    assaytypes[doc["assaytype"]] = 1
                imputed = 0
                if "imputed" in doc:
                    imputed = 1
                impDict[key] = imputed
                # A stored cohort value takes precedence over the
                # freshly computed one.
                hwep = float(p_hwe)
                if "cohort_1_hwe" in doc:
                    hwep = doc["cohort_1_hwe"]
                call_rate = 0.0
                if sample_count:  # avoid dividing by zero samples
                    call_rate = float(geno_count) / sample_count
                snp_rows.append(
                    "%s,%s,%s,%d,%s,%s,%s,%s,%s,%d,%.5f,%.5f,%.5f\n" %
                    (doc["rsid"], doc["assaytype"], doc["chromosome"],
                     doc["position"], doc["alleleA"], alleleAf,
                     doc["alleleB"], alleleBf, maf_value, imputed,
                     call_rate, hwep, doc["info"]))
        pdata = self.get_sample_values(data, len(data), rslist, impDict,
                                       assaytypes, Afreq, Bfreq, threshold)
        return (pdata, "".join(snp_rows), msg)

    def get_sample_values(self, records, numrecs, rslist, impDict, platforms,
                          Afreq, Bfreq, threshold):
        """Process vcf records
        NOTE: impDict keyed on a composite of rsid_platform

        records are tab separated VCF lines with the platform forced in
        as column 0. Returns the per-platform CSV data produced by
        build_csv_data. numrecs is accepted but not used.
        """
        first_sample_idx = 9 + 1  # add 1 due to forcing assaytype in as col 0
        samplesByAt = {}
        for platform in platforms:
            samplesByAt[platform] = [
                self.convert_to_prochi(samp)
                for samp in self.sam_coll.get_samples(platform)
            ]

        # A dict of dicts of tables
        sampleDict = {}
        for platform in samplesByAt:
            for samp in samplesByAt[platform]:
                if samp not in sampleDict:
                    sampleDict[samp] = dict((rsid, []) for rsid in rslist)

        for line in records:
            linedata = line.split('\t')
            platform = linedata[0]
            rsid = linedata[3]
            alleleA = linedata[4]
            alleleB = linedata[5]
            if platform not in samplesByAt:
                logging.debug("%s not cached", platform)
                samplesByAt[platform] = [
                    self.convert_to_prochi(samp)
                    for samp in self.sam_coll.get_samples(platform)
                ]
            samples = samplesByAt[platform]
            key = rsid + "_" + platform
            # Column i of the genotype section belongs to samples[i].
            for arridx, value in enumerate(linedata[first_sample_idx:]):
                sampleId = samples[arridx]
                if sampleId in sampleDict:
                    sampleDict[sampleId][rsid].append([
                        platform, impDict[key], value, alleleA, alleleB,
                        Afreq[key], Bfreq[key]
                    ])
        by_platform_data = self.build_csv_data(rslist, sampleDict, platforms,
                                               threshold)
        return (by_platform_data)
class _markers():
    """
    Access to the 'markers' collection and to the marker records held in
    the tabix-indexed VCF files.
    """

    def __init__(self, db, filedata_coll, sample_coll, gwasdb, probidx=1):
        """
        db            - database handle exposing a 'markers' collection
        filedata_coll - helper used to look up VCF file paths
        sample_coll   - helper used to look up sample counts
        gwasdb        - stored for later use
        probidx       - index of the probabilities field inside a ':'
                        separated per-sample genotype value
        """
        self.db = db
        self.gwasdb = gwasdb
        self.markers = db.markers
        # calls / icalls are parallel: index 3 is the "no call" slot.
        self.calls = ["0/0", "0/1", "1/1", "Missing"]
        self.icalls = [0, 1, 2, -9]
        self.filedata_coll = filedata_coll
        self.sample_coll = sample_coll
        self.probidx = probidx
        self.vcfr = VCFrecord()

    def get_probidx(self):
        """ Return the index of the probabilities field in a sample value """
        return self.probidx

    def get_marker_data(self, markerid, assaytype):
        """
        Get the data for a genetic marker / assay platform combination

        Returns None when nothing matches or when the query fails.
        """
        query = {'rsid': markerid, 'assaytype': assaytype}
        try:
            return self.markers.find_one(query)
        except Exception:
            logging.exception("Unexpected error querying marker %s / %s",
                              markerid, assaytype)
            return None

    def get_marker_data_multi(self, markerid):
        """
        Get the data for a genetic marker (DBSNP rs number or chrn:pos:I|D format)

        Returns a list with one doc per platform, each given a
        'samplecount' entry; [] when nothing matches or the query fails.
        """
        docs = []
        try:
            cursor = self.markers.find({'rsid': markerid})
        except Exception:
            logging.exception("Unexpected error querying marker %s",
                              markerid)
            return docs
        for doc in cursor:
            doc["samplecount"] = self.sample_coll.get_count(doc["assaytype"])
            docs.append(doc)
        return docs

    def get_marker_data_by_range(self, chr, start, end):
        """
        Get the data for genetic markers within a range

        Returns (docs, msg); msg is "" on success and explains why docs
        is empty otherwise. Alleles longer than 10 characters are
        shortened to their first 10 characters plus " ...".
        """
        docs = []
        start_pos = int(start)
        end_pos = int(end)
        span = end_pos - start_pos

        # Some basic sanity checking
        if span > 250000:
            return (docs,
                    "Range is too great should be 250Kb or less [%d]" % span)
        if span < 0:
            return (docs, "Start pos is greater than End pos")

        query = {
            'chromosome': "%.2d" % (int(chr)),
            'position': {'$gte': start_pos, '$lte': end_pos},
        }
        logging.debug("RANGE QUERY %s", query)
        try:
            cursor = self.markers.find(query)
        except Exception as e:
            logging.exception("Range query failed: %s", query)
            return (docs, "Unexpected error: %s" % (e,))

        for doc in cursor:
            for allele in ("alleleA", "alleleB"):
                if len(doc[allele]) > 10:
                    doc[allele] = doc[allele][0:10] + " ..."
            doc["samplecount"] = self.sample_coll.get_count(doc["assaytype"])
            docs.append(doc)

        msg = ""
        if len(docs) == 0:
            msg = "Nothing found in range"
        return (docs, msg)

    def get_marker_totals(self):
        """
        Use agg framework to get totals by CHR

        Returns a list of (_id, count) tuples where _id is the
        {"chr": ..., "at": ...} grouping key.
        """
        curs = self.db.markers.aggregate([{
            "$group": {
                "_id": {
                    "chr": "$chromosome",
                    "at": "$assaytype"
                },
                "mkrsPerChrom": {
                    "$sum": 1
                }
            }
        }, {
            "$sort": {
                "_id": 1
            }
        }])
        # Older pymongo returns {'result': [...]}; pymongo 3.0+ returns an
        # iterable cursor instead.
        results = curs['result'] if isinstance(curs, dict) else curs
        return [(doc['_id'], doc['mkrsPerChrom']) for doc in results]

    def get_raw_marker_values(self, filepath, variantid, chr, posn):
        """
        Access a vcf file to extract marker data

        Returns (rec, rtn_rec): the parsed data array and the raw record
        whose id and position match variantid / posn, or ([], "") when no
        record matches. If several records match, the last one wins.
        """
        rec = []
        rtn_rec = ""
        tabixFile = pysam.Tabixfile(filepath)
        try:
            if int(chr) > 22:
                chr = "NA"
            else:
                chr = str(chr)
            try:
                records = tabixFile.fetch(chr, posn - 1, posn)
            except ValueError:
                # Retry without the first character of the chromosome id
                # (e.g. "01" -> "1").
                chr = chr[1:]
                records = tabixFile.fetch(chr, posn - 1, posn)
            for record in records:
                data = self.vcfr.get_data_array(record)
                dvarid = self.vcfr.get_var_id_from_array(data)
                dposn = self.vcfr.get_posn_from_array(data)
                if (dvarid == variantid) and (int(dposn) == posn):
                    rec = data
                    rtn_rec = record
        finally:
            # Release the file handle on every path, including errors.
            tabixFile.close()
        return rec, rtn_rec

    def get_genotype_probs(self, sample_values, threshold, has_GP=True):
        """
        Summarise marker_values based on probabilities
        TODO: deal with user-supplied threshold
        TODO: what to do when has_GP is false

        Returns (genotype_counts, sample_count, geno_count, maf, a_freq,
        b_freq, p_hwe) where geno_count is the number of samples with a
        0/0, 0/1 or 1/1 call. has_GP is accepted but not used.
        """
        sample_count = 0
        genotype_counts = {}
        for sample_value in sample_values:
            sample_count += 1
            genoValues = sample_value.split(':')
            probVals = genoValues[self.get_probidx()].split(',')
            (key, ccode, maxprob) = self.get_call(probVals, threshold)
            genotype_counts[key] = genotype_counts.get(key, 0) + 1

        hom1_ct = genotype_counts.get("0/0", 0)
        het_ct = genotype_counts.get("0/1", 0)
        hom2_ct = genotype_counts.get("1/1", 0)
        geno_count = hom1_ct + het_ct + hom2_ct

        mafr = maf(het_ct, hom1_ct, hom2_ct, geno_count)
        AlleleAfr = af(het_ct, hom1_ct, hom2_ct, geno_count)
        AlleleBfr = af(het_ct, hom2_ct, hom1_ct, geno_count)
        p_hwe = HWE_exact(het_ct, hom1_ct, hom2_ct, geno_count)
        return (genotype_counts, sample_count, geno_count, mafr, AlleleAfr,
                AlleleBfr, p_hwe)

    def get_call(self, probs, threshold):
        """
        Turn a list of probabilities into a genotype call.

        Returns (call, integer call, highest probability). The call is
        "Missing" / -9 when no probability is above zero or when the
        highest one is below a non-zero threshold.
        threshold may be a number or a numeric string; None means no
        threshold.
        """
        # Compare numbers with numbers: a string threshold would otherwise
        # compare as "greater" on Python 2 and mark every call Missing.
        if threshold is None:
            threshold = 0.0
        else:
            threshold = float(threshold)
        max_prob = 0.0
        max_idx = 3
        for idx, prob in enumerate(probs):
            value = float(prob)
            if value > max_prob:
                max_prob = value
                max_idx = idx
        if threshold != 0.0 and max_prob < threshold:
            max_idx = 3
        return (self.calls[max_idx], self.icalls[max_idx], max_prob)

    def get_geno_data(self, rsid, sample_id, assaytype_list_posns):
        """
        Get one sample's raw genotype value for rsid on each platform.

        assaytype_list_posns maps platform -> position of the sample in
        that platform's genotype columns. Returns
        {'<sample_id>_<platform>': value}; platforms missing from the map,
        or with no matching VCF record, are left out.
        """
        geno_values = {}
        for doc in self.get_marker_data_multi(rsid):
            assaytype = doc["assaytype"]
            # Skip early: no point reading the VCF for a platform the
            # sample is not on.
            if assaytype not in assaytype_list_posns:
                continue
            # first get filepath
            fpath = self.filedata_coll.get_filepath(assaytype,
                                                    doc["chromosome"])
            # this is not ideal - need to get on top of this chromosome
            # id thing
            chrom = "%.2d" % int(doc["chromosome"])
            rec, fullrec = self.get_raw_marker_values(fpath, doc["rsid"],
                                                      chrom, doc['position'])
            if not rec:
                logging.warning("No VCF record found for %s (%s) in %s",
                                rsid, assaytype, fpath)
                continue
            prfx, genodata = self.vcfr.get_prfx_sfx_from_array(rec)
            list_posn = assaytype_list_posns[assaytype]
            logging.debug("list posn %s %s %s", rsid, sample_id, list_posn)
            geno_values[sample_id + "_" + assaytype] = genodata[list_posn]
        return (geno_values)