Пример #1
0
 def get_variant_summary_probs(self, rsid, threshold):
     """Return summary statistics for every stored record of ``rsid``.

     Each document yielded by ``self.var_coll.get_variant_data_multi`` is
     annotated in place with the keys ``selected``, ``a_af``, ``b_af``,
     ``hwe_p`` and ``Missing`` and appended to the result list.
     ``threshold`` is forwarded unchanged to
     ``self.var_coll.get_genotype_probs``.

     Returns a ``(variant_array, msg)`` tuple.  ``msg`` is the empty string
     unless no document was found, in which case it names the variant.
     """
     summaries = []
     for record in self.var_coll.get_variant_data_multi(rsid):
         # The chromosome is always rendered with (at least) two digits.
         chrom = "%.2d" % int(record["chromosome"])
         vcf_path = self.filepaths_coll.get_filepath(record["assaytype"],
                                                     chrom)
         raw_line = self.var_coll.get_raw_variant_values(
             vcf_path, chrom, record['position'])
         parsed = VCFrecord(raw_line)
         _prefix, sample_fields = parsed.get_prfx_sfx()
         # Only the count dict, the allele frequencies and the HWE p-value
         # are used; the other tuple members are unpacked and ignored.
         (genotype_counts, _n_samples, _n_genotypes, _maf, freq_a, freq_b,
          hwe_p) = self.var_coll.get_genotype_probs(
              sample_fields, threshold, parsed.get_probidx())
         if 'Missing' in genotype_counts:
             missing = genotype_counts['Missing']
         else:
             missing = 0
         record.update({
             'selected': 1,
             'a_af': freq_a,
             'b_af': freq_b,
             'hwe_p': hwe_p,
             'Missing': missing,
         })
         summaries.append(record)

     msg = "" if summaries else "Variant NOT FOUND - %s, " % (rsid)
     return (summaries, msg)
Пример #2
0
    def check_concordancies(self, data_list, assays, chipval):
        """Check that the non-empty VCF records in ``data_list`` agree.

        The first non-empty record acts as the reference: its alleles and
        genotype counts are remembered.  Every later non-empty record must
        carry the same ref/alt alleles, and a chi-square test of the
        reference counts against that record's counts must not produce a
        p-value below ``chipval``.

        Args:
            data_list: raw VCF records, one per assay; empty entries are
                skipped.
            assays: assay names, index-aligned with ``data_list``; only used
                in log messages.
            chipval: p-value threshold below which the chi-square test is
                treated as a rejection.

        Returns:
            ``False`` on the first allele mismatch (after incrementing
            ``self.allele_discord_count``) or chi-square rejection (after
            incrementing ``self.chisq_count``); ``True`` otherwise.

        NOTE(review): the record is read through the ``VCFrecord`` wrapper's
        zero-argument accessors (``get_allele_counts``, ``get_varid``,
        ``get_posn``).  The earlier revision passed an undefined name
        ``data`` to these calls and raised ``NameError`` on the first
        non-empty record.  ``get_posn()`` taking no argument is assumed by
        analogy with ``get_varid()`` -- confirm against ``VCFrecord``.
        """
        obs = [0.0] * 3
        exp = [0.0] * 3
        allele_ref_1 = ""
        allele_alt_1 = ""
        for i, vcf_record in enumerate(data_list):
            if not vcf_record:
                continue

            vcfr = VCFrecord(vcf_record)
            (homref_count, het_count, homalt_count, _nc_count,
             _miss_count) = vcfr.get_allele_counts()
            allele_a, allele_b = vcfr.get_alleles()
            # Add 1 to every count to prevent 0-divide in the chi-square.
            counts = [homref_count + 1, het_count + 1, homalt_count + 1]

            if allele_ref_1 == "":
                # First usable record: becomes the reference for the rest.
                allele_ref_1 = allele_a
                allele_alt_1 = allele_b
                obs = counts
                continue

            exp = counts
            varid = vcfr.get_varid()
            posn = vcfr.get_posn()

            if (allele_ref_1 != allele_a) or (allele_alt_1 != allele_b):
                self.allele_discord_count += 1
                logging.info(
                    "Allele discordancy: assay1=%s, assay2=%s, varid=%s, posn=%d, ref1=%s, alt1=%s, ref2=%s, alt2=%s",
                    assays[0], assays[i], varid, int(posn),
                    allele_ref_1, allele_alt_1, allele_a, allele_b)
                return False

            # NOTE(review): obs and exp come from different assays and may
            # have different totals; recent SciPy releases reject such
            # input with ValueError -- confirm against the pinned version.
            chi_stat, chi_p_value = chisquare(obs, f_exp=exp)
            if chi_p_value < chipval:
                self.chisq_count += 1
                logging.info(
                    "CHI SQ test REJECT: assay1=%s, assay2=%s, varid=%s, posn=%d, chistat=%f, p_val=%e, chipval=%e, obs=%s, exp=%s, at %d",
                    assays[0], assays[i], varid, int(posn), chi_stat,
                    chi_p_value, chipval, str(obs), str(exp), i)
                return False
            logging.info(
                "CHI SQ test OK: assay1=%s, assay2=%s, varid=%s, posn=%d, chistat=%f, p_val=%e, obs=%s, exp=%s, at %d",
                assays[0], assays[i], varid, int(posn), chi_stat,
                chi_p_value, str(obs), str(exp), i)

        return True
Пример #3
0
    def get_combined_array(self,
                           buffer_list,
                           cr_list,
                           assay_list,
                           threshold=0.9):
        """Merge per-assay VCF records into a single row of genotype calls.

        For every non-empty record in ``buffer_list`` and every sample field
        in it that is not ".":
        1) look up the sample's column header in ``self.file_positions``
        2) map that header to its slot through ``self.combined_positions``
        3) store the thresholded genotype call in that slot
        If the slot is already filled, ``self.geno_overlap_count`` is bumped
        and the two calls are reconciled with ``self.call_genotype``.

        ``cr_list`` is accepted but not read here (TODO - CR check).

        Returns a list with one entry per combined position; slots that
        received no data stay ".".
        """
        # Index -> assay type lookup; built as before but not read below.
        assay_by_index = dict(enumerate(assay_list))

        merged = ["."] * len(self.combined_positions)
        for rec_idx, raw_record in enumerate(buffer_list):
            if len(raw_record) == 0:
                continue

            record = VCFrecord(raw_record)
            _prefix, sample_fields = record.get_prfx_sfx()
            prob_index = record.get_probidx()
            _variant_id = record.get_varid()  # only of use for debug output
            # Explicit comparison kept so non-bool results behave as before.
            lacks_assay_tag = record.has_fmt("AT") == False

            for col, field in enumerate(sample_fields):
                if field == ".":
                    continue
                header = self.file_positions[rec_idx][col]
                slot = self.combined_positions[header]
                call = self.call_geno_for_threshold(field, prob_index,
                                                    threshold)
                if lacks_assay_tag:
                    # Tag the call with the abbreviation of its assay.
                    call = call + ":" + self.assay_abbrev[assay_list[rec_idx]]
                existing = merged[slot]
                if existing != ".":
                    self.geno_overlap_count += 1
                    call = self.call_genotype(existing, call, prob_index)
                merged[slot] = call

        return merged
Пример #4
0
    def get_rslist_data(self, input_rslist, threshold, download_list):
        """Build per-sample data and a CSV summary for a list of rsids.

        For every rsid, each matching variant document (one per assay
        platform) is loaded, its raw VCF record is parsed, and genotype
        statistics are computed via ``self.var_coll.get_genotype_probs``.
        One CSV row per document is produced and the parsed records are
        handed to ``self.get_sample_values``.

        Args:
            input_rslist: iterable of rsids to look up.
            threshold: forwarded to ``get_genotype_probs`` and
                ``get_sample_values``.
            download_list: accepted for interface compatibility; not read
                here.

        Returns:
            ``(pdata, snpdata, msg)`` where ``pdata`` is the result of
            ``self.get_sample_values``, ``snpdata`` is the CSV text
            (header line plus one line per document) and ``msg`` is
            always ``None``.

        A document without an ``info`` key is reported with ``nan`` in the
        Info column, and a record with no samples gets a call rate of 0.0;
        both cases previously raised.
        """
        msg = None
        # Rows are collected in a list and joined once at the end.
        snp_rows = [
            "SNPId,AssayType,chr,pos,REF,REF_fr,ALT,ALT_fr,MAF,Imputed,CallRate,HWE_pval,Info\n"
        ]

        data = []
        assaytypelist = []
        rslist = []
        assaytypes = {}
        Afreq = {}
        Bfreq = {}
        impDict = {}
        data_count = 0

        for rsid in input_rslist:
            docs = self.var_coll.get_variant_data_multi(rsid)
            if len(docs) > 0:
                rslist.append(rsid)
            # handling SNPs on multiple platforms
            for doc in docs:
                # always force chromosome to 2 digits
                chromosome = "%.2d" % int(doc["chromosome"])
                fpath = self.filepaths_coll.get_filepath(
                    doc["assaytype"], chromosome)
                fullrec = self.var_coll.get_raw_variant_values(
                    fpath, chromosome, doc['position'])
                vcfr = VCFrecord(fullrec)
                prfx, sfx = vcfr.get_prfx_sfx()
                probidx = vcfr.get_probidx()
                (gc_count_dict, sample_count, geno_count, maf, alleleAf,
                 alleleBf, p_hwe) = self.var_coll.get_genotype_probs(
                     sfx, threshold, probidx)

                key = doc["rsid"] + "_" + doc["assaytype"]
                Afreq[key] = alleleAf
                Bfreq[key] = alleleBf
                data_count += 1
                hwep = float(p_hwe)

                assaytypelist.append(doc["assaytype"])
                data.append(vcfr)
                assaytypes.setdefault(doc["assaytype"], 1)

                # A variant counts as imputed when flagged explicitly or
                # when it carries an info value other than 1.0.
                has_info = "info" in doc
                if "imputed" in doc or (has_info and doc["info"] != 1.0):
                    imputed = 1
                else:
                    imputed = 0
                impDict[key] = imputed

                # "info" is optional (see the membership test above).
                info = doc["info"] if has_info else float("nan")
                # Guard against records that contain no samples at all.
                if sample_count:
                    call_rate = float(geno_count) / sample_count
                else:
                    call_rate = 0.0

                snp_rows.append(
                    "%s,%s,%s,%d,%s,%s,%s,%s,%s,%d,%.5f,%.5f,%.5f\n" % (
                        doc["rsid"], doc["assaytype"], doc["chromosome"],
                        doc["position"], doc["alleleA"],
                        alleleAf, doc["alleleB"], alleleBf, maf, imputed,
                        call_rate, hwep, info))

        pdata = self.get_sample_values(assaytypelist, data, data_count, rslist,
                                       impDict, assaytypes, Afreq, Bfreq,
                                       threshold)
        return (pdata, "".join(snp_rows), msg)
Пример #5
0
def main(options):
    """Convert a VCF stream on stdin into a sample-by-variant CSV on stdout.

    ``##`` meta lines and blank lines are skipped.  The ``#`` header line
    supplies the sample column names.  Every following record adds one
    variant column: for each sample, the first ``:``-separated element of
    its genotype field is mapped through ``icalls``, and a "." field becomes
    an empty cell.  The output is a header row (``id`` followed by the
    variant ids) and then one row per sample.

    Args:
        options: accepted for interface compatibility; not read here.

    Returns:
        The number of sample rows written.
    """
    hdrData = ["id"]
    sampleDict = {}
    colPosns = {}
    count = 0

    mafh = Mafhelper()

    for line in sys.stdin:
        line = line.strip()
        if not line or line.startswith('##'):
            continue

        vcfr = VCFrecord(line)
        prfx, sfx = vcfr.get_prfx_sfx()
        if line.startswith('#'):
            # Parse out the header record.
            for i, col_hdr in enumerate(sfx):
                colPosns[i] = col_hdr
                sampleDict[col_hdr] = []
            continue

        varid = vcfr.get_varid_ukb()
        ref, alt = vcfr.get_alleles()
        probidx = vcfr.get_probidx()
        (homref_count, het_count, homalt_count, nc_count,
         miss_count) = vcfr.get_allele_counts()
        maf, ma = mafh.maf(het_count, homref_count, ref, homalt_count,
                           alt, nc_count)
        # NOTE: allele flipping is currently disabled.  It was meant to be
        # switched on when the minor allele ``ma`` equals ``ref``.
        flip = False
        hdrData.append(varid)
        for i, str_geno in enumerate(sfx):
            if str_geno == ".":
                sampleDict[colPosns[i]].append("")
                continue
            geno = str_geno.split(":")
            # The result is not used; the call is kept so input is still
            # processed by get_max_prob exactly as before.
            max_prob, max_idx = get_max_prob(geno, probidx)
            i_call = icalls[geno[0]]
            if flip:
                # Swap the two homozygous calls.  The "0" branch used to
                # compare instead of assign, so it never swapped.
                if i_call == "0":
                    i_call = "2"
                elif i_call == "2":
                    i_call = "0"
            sampleDict[colPosns[i]].append(str(i_call))

    # Single-argument parenthesised print gives the same output under the
    # Python 2 print statement and the Python 3 print function.
    print(",".join(hdrData))
    for samp in sampleDict:
        count += 1
        print(",".join([samp] + sampleDict[samp]))
    return count