def main(options):
    """Print one sample's entry for one rsid, once per assay type.

    Every variant document returned for ``options.rsid`` is visited. When
    the sample ``options.sampleid`` has a position for the document's assay
    type, the matching variant record is read and a comma-separated line is
    printed to stdout: rsid, sample id, assay type, sample position, and the
    element of the record's suffix found at that position.

    Args:
        options: parsed options; the attributes ``rsid`` and ``sampleid``
            are read.

    Returns:
        The number of lines printed.
    """
    godb = GoDb()
    count = 0

    # Step 1 - get the list of entries for the rsid - one per assaytype
    vardocs = godb.get_multiple_variants(options.rsid)
    sampposns = godb.get_sample_posns(options.sampleid)

    for doc in vardocs:
        atype = doc["assaytype"]
        # Nothing is printed for assay types the sample has no position in,
        # so skip them before reading the variant file.
        if atype not in sampposns:
            continue

        filepath = godb.get_filepath(atype, doc["chromosome"])
        rec = godb.get_variant_file_data(
            filepath, doc["chromosome"], doc["position"])
        # The lookup can come back empty (the merge script in this code base
        # checks the same call for None); there is no record to report then.
        if rec is None:
            continue

        vcfr = VCFrecord(rec)
        # Only the suffix part of the record is indexed by sample position.
        _, sfx = vcfr.get_prfx_sfx()
        posn = sampposns[atype]
        # Single parenthesised argument: prints identically under the
        # Python 2 print statement and the Python 3 print function.
        print("%s,%s,%s,%d,%s" % (
            options.rsid, options.sampleid, atype, posn, sfx[posn]))
        count += 1

    return count
def main(options):
    """Merge the variant records of a set of rsids across assay types.

    The rsids are taken from ``options.snpfile`` when it is set, otherwise
    from the comma-separated ``options.rsids``. For each rsid the variant
    documents are looked up, the record for every included assay type is
    read, and one combined tab-separated record per rsid is printed to
    stdout below a single header line.

    Args:
        options: parsed options. Attributes read here:
            ``snpfile`` - name of a file handed to ``load_snpfile_data``,
            or None;
            ``rsids`` - comma-separated rsids, used when ``snpfile`` is None;
            ``prfx`` - optional value handed to ``godb.get_full_filepath``;
            ``check`` - 'Y' switches the concordancy check on;
            ``chipval`` - passed through to the concordancy check.

    Returns:
        The number of combined records printed.

    The process exits via ``sys.exit()`` when the rsid list cannot be
    loaded.
    """
    # Assay types taking part in the merge. Earlier configurations of this
    # script also used: "affy", "illumina", "metabo", "exome", "bigtest",
    # "biggertest".
    included_assaytypes = {"broad": 1}

    rsids = []
    godb = GoDb()
    try:
        if options.snpfile is not None:
            # Context manager so the handle is closed even if loading fails.
            with open(options.snpfile, "r") as fh:
                rsids = load_snpfile_data(fh)
        else:
            rsids = options.rsids.split(",")
    except IOError as e:
        print("I/O error({0}): {1}".format(e.errno, e.strerror))
        sys.exit()
    except TypeError as e:
        # Two spaces on purpose: reproduces what the Python 2 statement
        # `print "Missing arguments ", e` wrote.
        print("Missing arguments  %s" % (e,))
        sys.exit()
    except Exception:
        logging.info("Unexpected error: %s", str(sys.exc_info()))
        sys.exit()

    # Step 0 - instantiate helper objects
    mafh = Mafhelper()
    # NOTE(review): hweh is never used below; the construction is kept only
    # in case Hwehelper() has side effects - confirm and remove.
    hweh = Hwehelper()

    # Data structures
    atype_list = []        # assay types in first-seen order
    atype_posns = {}       # assay type -> index into atype_list
    rsid_assaytypes = {}   # rsid -> variant docs of included assay types
    rsid_dict = {}         # rsid -> one record slot per assay type
    rsid_cr_dict = {}      # rsid -> one cr value slot per assay type
    hdr_pref = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER",
                "INFO", "FORMAT"]

    # Step 1 - get the list of entries for each rsid - one per assaytype
    for rsid in rsids:
        docs = godb.get_multiple_variants(rsid)
        if docs.count() == 0:
            logging.info("RSID %s NOTFOUND", rsid)
            continue
        rsid_assaytypes[rsid] = []

        # Step 1a - collect assaytypes and marker documents.
        # This establishes a list order which must be observed throughout.
        for doc in docs:
            if doc["assaytype"] not in included_assaytypes:
                continue
            if doc["assaytype"] not in atype_list:
                atype_list.append(doc["assaytype"])
            rsid_assaytypes[rsid].append(doc)
    logging.info(str(atype_list))

    # Step 2 - collect lists of prochis (sample ids) by assaytype
    prochi_list = []
    for i, atype in enumerate(atype_list):
        atype_posns[atype] = i
        prochi_list.append(godb.get_samples(atype))
    mm = Multibuffermerge(prochi_list)

    # Step 3 - get combined column headers.
    # NOTE(review): the result of get_combined_positions() is not used here;
    # the call is kept in case it prepares state inside Multibuffermerge.
    mm.get_combined_positions()
    # combocol is a list [colname1, colname2, ...]; its order is kept intact.
    combocol = mm.get_combined_columns()

    # Step 4 - for each variant by rsid, read the record per assay type
    for rsid in rsid_assaytypes:
        if rsid not in rsid_dict:
            # One independent empty placeholder per assay type.
            rsid_dict[rsid] = [[] for _ in atype_list]
            rsid_cr_dict[rsid] = [[] for _ in atype_list]

        for doc in rsid_assaytypes[rsid]:
            atype = doc["assaytype"]
            chrom = doc["chromosome"]
            if options.prfx is not None:
                fpath = godb.get_full_filepath(atype, chrom, options.prfx)
            else:
                fpath = godb.get_filepath(atype, chrom)
            logging.info("Assaytype=%s fpath=%s", atype, fpath)

            result = godb.get_variant_file_data(fpath, chrom, doc["position"])
            if result is None:
                continue
            vcfr = VCFrecord(result)
            # Only accept the record when its id matches the requested rsid.
            if vcfr.get_varid() != rsid:
                continue

            maf, ma, cr = mafh.get_maf_and_cr(vcfr)
            # TODO - ALSO check maf, also apply QC filter at individual
            # record level
            slot = atype_posns[atype]
            rsid_cr_dict[doc["rsid"]][slot] = cr
            rsid_dict[doc["rsid"]][slot] = result
            logging.info("%s (%s), maf=%s, ma=%s, cr=%s",
                         doc["rsid"], atype, maf, ma, cr)

    # Step 5 - execute the merge process
    print("\t".join(hdr_pref + combocol))
    count = 0
    concordant = True
    for rsid in rsid_dict:
        records = rsid_dict[rsid]
        # `records` is empty when no assay type survived the filter in
        # step 1; indexing [0] would raise IndexError in that case. A record
        # is only merged when the first assay type's slot was filled.
        if not records or len(records[0]) == 0:
            continue

        if options.check == 'Y':
            concordant = mm.check_concordancies(
                records, atype_list, options.chipval)

        # Strict comparison kept as in the original: the return type of
        # check_concordancies is not visible from here.
        if concordant == True:
            comborec = mm.get_combined_array(
                records, rsid_cr_dict[rsid], atype_list)
            vcfr = VCFrecord(records[0])
            prfx, _ = vcfr.get_prfx_sfx()
            if len(prfx) > 0:
                logging.info("PRFX = %s, for %s", str(prfx), rsid)
                # Index 8 lines up with "FORMAT" in hdr_pref.
                prfx[8] += ":AT"
                outrec = prfx + comborec
                print("\t".join(outrec))
                count += 1
            else:
                logging.info("RSID %s NOTFOUND (2)", rsid)
        else:
            logging.info("Concordancy check fail for - %s", rsid)

    chi_test_count, allele_disc_count, overlap_count = mm.get_counts()
    logging.info(
        "CHI test count = %d, Allele discord count = %d, "
        "Overlap check count = %d",
        chi_test_count, allele_disc_count, overlap_count)
    return count