Example #1
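Computes Pearson and Spearman correlations between every genotype marker and every probe set in a freeze; `utilities.overlap` pairs up the strains that appear in both value dictionaries before `calculate.correlation` is run.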
def correlations(outputdir, genos, probesetfreeze):
    print(probesetfreeze)
    probesetfreezeid = probesetfreeze[0]
    probesetfreezename = probesetfreeze[1]
    probesetfreezefullname = probesetfreeze[2]
    #
    outputfile = open("%s/%d_%s.txt" % (outputdir, probesetfreezeid, probesetfreezename), "w+")
    outputfile.write("%s\t" % "ProbeSet Id")
    outputfile.write("%s\t" % "ProbeSet Name")
    outputfile.write("%s\t" % "Geno Name")
    outputfile.write("%s\t" % "Overlap Number")
    outputfile.write("%s\t" % "Pearson r")
    outputfile.write("%s\t" % "Pearson p")
    outputfile.write("%s\t" % "Spearman r")
    outputfile.write("%s\t" % "Spearman p")
    outputfile.write("\n")
    outputfile.flush()
    #
    probesetxrefs = probesets.get_probesetxref(probesetfreezeid)
    print "Get %d probesetxrefs" % (len(probesetxrefs))
    #
    for probesetxref in probesetxrefs:
        #
        probesetid = probesetxref[0]
        probesetdataid = probesetxref[1]
        probeset = probesets.get_probeset(probesetid)
        probesetname = probeset[1]
        probesetdata = probesets.get_probesetdata(probesetdataid)
        probesetdata = list(zip(*probesetdata))  # transpose rows to columns; list() is needed under Python 3
        probesetdata = utilities.to_dic([strain.lower() for strain in probesetdata[1]], probesetdata[2])
        #
        for geno in genos:
            genoname = geno['locus']
            outputfile.write("%s\t" % probesetid)
            outputfile.write("%s\t" % probesetname)
            outputfile.write("%s\t" % genoname)
            #
            dic1 = geno['dicvalues']
            dic2 = probesetdata
            keys, values1, values2 = utilities.overlap(dic1, dic2)
            rs = calculate.correlation(values1, values2)
            #
            outputfile.write("%s\t" % len(keys))
            outputfile.write("%s\t" % rs[0][0])
            outputfile.write("%s\t" % rs[0][1])
            outputfile.write("%s\t" % rs[1][0])
            outputfile.write("%s\t" % rs[1][1])
            outputfile.write("\n")
            outputfile.flush()
    #
    outputfile.close()
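`utilities.overlap` itself is not shown above. Judging only from the call site (`keys, values1, values2 = utilities.overlap(dic1, dic2)`), a minimal sketch might look like the following; it is an assumption, not the project's actual implementation:

def overlap(dic1, dic2):
    # keep only the strains present in both dictionaries, in a stable order
    keys = sorted(set(dic1) & set(dic2))
    values1 = [dic1[k] for k in keys]
    values2 = [dic2[k] for k in keys]
    return keys, values1, values2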
Example #2
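Flags spacecraft dwell intervals that coincide with momentum dumps, NSM/SSM safing events, short durations, or low altitude; here `overlap` intersects N x 2 arrays of [start, stop] times.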
t_npm = array([t1_npm, t2_npm]).transpose() 

# Identify dwells with momentum unloads
print('filtering dumps, nsm/ssm events, short dwells, perigees, and outliers...')
aounload = fetch.Msid('AOUNLOAD', t_start, t_stop)
dump = aounload.vals != 'MON '
if any(dump[:1]) or any(dump[-2:]):
    raise RuntimeError('Timeframe must not start or end with a momentum dump.')
i1_dump = ~dump[:-1] & dump[1:]
i2_dump = dump[:-1] & ~dump[1:]
if sum(i1_dump) != sum(i2_dump):
    raise RuntimeError('Dump start and stop times do not correlate.')
t1_dump = aounload.times[nonzero(i1_dump)[0] + 1]
t2_dump = aounload.times[nonzero(i2_dump)[0] + 1]
t_dump = array([t1_dump, t2_dump]).transpose()
bad_dump = overlap(t_npm, t_dump)

# Identify dwells during NSM and SSM events
t_nsm = str_to_secs(nsm)
bad_nsm = overlap(t_npm, t_nsm)
t_ssm = str_to_secs(ssm)
bad_ssm = overlap(t_npm, t_ssm)

# Identify dwells that are too short for accurate reading
bad_short = (t_npm[:,1] - t_npm[:,0]) < min_dur

# Identify dwells with low altitude (gravity gradient torques will dominate)
i1_npm_ind = nonzero(i1_npm)[0]
i2_npm_ind = nonzero(i2_npm)[0]
min_dwell_alt = array([min(x['DIST_SATEARTH'].vals[i1_npm_ind[i]:i2_npm_ind[i]])
                       for i in range(len(i1_npm_ind))])
bad_low = min_dwell_alt < min_alt
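The `overlap` helper used above is also not part of the snippet. A plausible sketch, inferred from the calls (each [start, stop] row of the first array is flagged if it intersects any row of the second):

import numpy as np

def overlap(intervals, events):
    # hypothetical reconstruction: True for each row of `intervals`
    # that intersects at least one [t1, t2] row of `events`
    bad = np.zeros(len(intervals), dtype=bool)
    for t1, t2 in np.atleast_2d(events):
        bad |= (intervals[:, 0] < t2) & (intervals[:, 1] > t1)
    return bad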
Example #3
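Walks a coordinate-sorted fragment table and merges fragments from the same read and chromosome whose [start, end] coordinates overlap, keeping the widest span and the best mapping quality (MQ).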
    ]].values
    is_val = np.full(n_frg, fill_value=True)
    fi = 0
    # TODO: a better merging strategy would keep the top MQs, but that requires pairwise comparison of all fragments => expensive
    while fi < n_frg - 1:
        if fi % 1_000_000 == 0:
            print('\t{:12,d} fragments checked for overlap before merging.'.format(fi))
        if (frg_np[fi, 0] != frg_np[fi + 1, 0]) or (frg_np[fi, 1] !=
                                                    frg_np[fi + 1, 1]):
            fi += 1
            continue

        # check overlap (ignoring strand)
        fi_be = fi
        while overlap(frg_np[fi_be, 2:4], frg_np[fi + 1:fi + 2, 2:4])[0]:
            fi += 1
            if fi == n_frg - 1:
                break
        if fi_be != fi:
            # bam_pd.loc[fi_be:fi]
            # frg_np[fi_be:fi + 1, :]
            frg_np[fi_be, 2] = np.min(frg_np[fi_be:fi + 1, 2])
            frg_np[fi_be, 3] = np.max(frg_np[fi_be:fi + 1, 3])
            frg_np[fi_be, 4] = np.max(frg_np[fi_be:fi + 1, 4])
            frg_np[fi_be, 5] = fi - fi_be
            is_val[fi_be + 1:fi + 1] = False
        fi += 1
    print('\t{:,d} overlapping fragments are merged.'.format(np.sum(~is_val)))
    bam_pd[['map_start', 'map_end', 'mq', 'map_#merge']] = frg_np[:, 2:6]
    bam_pd = bam_pd.loc[is_val].reset_index(drop=True)
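Examples #3 to #5 all rely on a vectorised `overlap(query, targets, offset=0)` that tests one coordinate row against an array of rows. A minimal sketch consistent with those call sites (assumed, not the package's real code):

import numpy as np

def overlap(query, targets, offset=0):
    # assumed layout: the last two columns are [start, end]; a 3-column
    # query carries a leading chromosome id that must match as well
    query = np.asarray(query)
    targets = np.atleast_2d(np.asarray(targets))
    has_ol = (query[-2] <= targets[:, -1] + offset) & \
             (query[-1] >= targets[:, -2] - offset)
    if len(query) == 3:
        has_ol &= targets[:, 0] == query[0]
    return has_ol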
Example #4
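Iterates over reads in a BAM file with pysam and, for each fragment, uses `overlap` to find which viewpoint (VP) it maps to and `get_overlap` to count the overlapping bases.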
with pysam.AlignmentFile(inp_args.input_bam, 'rb') as src_fid:
    # note: no need to check continuity (uniqueness) of the read_ids here; the make_dataset script handles that, making better use of memory
    for rd_idx, read in enumerate(get_read(src_fid)):
        if rd_idx % 1_000_000 == 0:
            print('\t{:,d} reads processed'.format(rd_idx))
        n_read += 1

        # check overlap with probes/VPs
        hit_vps = {}
        hit_overlap_size = np.zeros(n_expr + 1, dtype=int)
        for frg in read:
            frg_crd = [
                chr2nid[frg.reference_name], frg.reference_start,
                frg.reference_end
            ]
            is_ol = overlap(frg_crd, vp_crds)
            if any(is_ol):
                vp_idx = np.where(is_ol)[0]
                assert len(vp_idx) == 1, '[e] A single fragment is mapped to multiple viewpoints!'
                vp_idx = vp_idx[0]

                hit_overlap_size[vp_idx] += frg.get_overlap(
                    vp_crds[vp_idx, 1], vp_crds[vp_idx, 2])
                if vp_idx not in hit_vps:  # coloring is based on the first fragment that maps to the VP
                    clr_ratio = float(np.mean(frg_crd[1:]) - vp_crds[vp_idx, 1]) / (
                        vp_crds[vp_idx, 2] - vp_crds[vp_idx, 1])
                    hit_vps[vp_idx] = {
                        'color':
Example #5
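Keeps significantly enriched bins, then uses `overlap` with a neighborhood offset to link neighboring bins and merge them into single calls, ranked by z-score.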
        sig_pd = bin_pd.loc[is_sig].reset_index(drop=True)
        print('#bins: {:4d} loaded, {:4d} enriched '.format(
            bin_pd.shape[0], sig_pd.shape[0]),
              end='')
        del bin_pd  # bin_pd.loc[is_sig] bin_pd.loc[~is_sig]
        if len(sig_pd) == 0:
            print()
            continue

        # marking neighbor bins
        enrich_crd = sig_pd[['chr', 'pos', 'pos']].values
        enrich_crd[:, 2] += bin_width
        nei_idxs = np.arange(len(sig_pd))
        for ci in range(len(sig_pd)):
            has_ol = overlap(enrich_crd[ci], enrich_crd,
                             offset=inp_args.neighborhood_width)
            if np.sum(has_ol) > 1:
                is_sel = np.isin(nei_idxs, nei_idxs[has_ol])  # sig_pd.loc[has_ol]
                nei_idxs[is_sel] = np.min(nei_idxs[is_sel])  # sig_pd.loc[is_sel]
        sig_pd['nei_idx'] = nei_idxs
        del enrich_crd

        # merging neighbor bins
        sig_pd = sig_pd.sort_values(by='#cpt_zscr',
                                    ascending=False).reset_index(drop=True)
        nei_grp = sig_pd.groupby(by='nei_idx', sort=False)
        for rank_idx, (nei_idx, nei_pd) in enumerate(nei_grp):
            itm_crd = [
Example #6
# bin_info['cmb_zscr'] = np.maximum(norm.ppf(1 - bin_info['cmb_qval']), 0)

# correct p-values for multiple testing
# TODO: the correction factor for cis-bins is too strong; there are not enough #background observations to reach small p-values
# bin_info['#cpt_qval'] = np.minimum(bin_info['#cpt_pval'] * bin_info.shape[0], 1)
# bin_info['cmb_qval'] = np.minimum(bin_info['cmb_pval'] * bin_info.shape[0], 1)

####################################################################################################################
# Output all windows
if inp_args.store_all_enrichments:
    os.makedirs(os.path.dirname(out_fpath_all), exist_ok=True)
    bin_info.to_csv(out_fpath_all, sep='\t', na_rep='nan', index=False, compression='gzip')
    print('All bins scores are saved to: {:s}'.format(out_fpath_all))

# Output top windows
is_roi = overlap([vp_info['vp_chr'], vp_info['vp_be'], vp_info['vp_en']],
                 bin_info[['chr', 'pos', 'pos']].values, offset=inp_args.roi_width)
is_enriched = bin_info['#cpt_zscr'] >= 5.0
bin_nroi = bin_info.loc[(~is_roi) & is_enriched].sort_values(by='#cpt_zscr', ascending=False).reset_index(drop=True)
del is_roi, is_enriched
os.makedirs(os.path.dirname(out_fpath_top), exist_ok=True)
bin_nroi.to_csv(out_fpath_top, sep='\t', na_rep='nan', index=False)
print('{:d} bins with elevated #captures (i.e., z-score >= 5.0) are stored in: {:s}'.format(len(bin_nroi), out_fpath_top))
# assert bin_nroi['#cpt_zscr'].iat[-1] < 8.0, 'Some "top bins" could be cropped, increase #top_bins that are stored (current={:d}).'.format(inp_args.n_topbins)

# Plotting
if inp_args.draw_plot:
    plt.figure(figsize=(25, 7))
    ax_h = plt.gca()
    ax_h.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, loc: '{:,.0f}'.format(x)))

    # Plot important areas
Example #7
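Correlates every BXD phenotype against every genotype marker, writing Pearson and Spearman statistics plus the number of strains shared by each pair.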
def bxd_geno_pheno_correlations(filename):
    #
    file = open(filename, 'w')
    inbredsetid = 1
    genofile = "/home/leiyan/gn/web/genotypes/BXD.geno"
    #
    t = genotypes.load_genos(genofile)
    genostrains = t[0]
    genos = t[1]
    print("From geno file, get %d strains" % (len(genostrains)))
    print("From geno file, get %d genos" % (len(genos)))
    #
    publishxrefs = phenotypes.get_publishxrefs(inbredsetid)
    print("get %d publishxrefs" % (len(publishxrefs)))
    #
    file.write("%s\t" % "PhenotypeID")
    file.write("%s\t" % "PhenotypeName")
    file.write("%s\t" % "MarkerName")
    file.write("%s\t" % "MarkerChromosome")
    file.write("%s\t" % "MarkerCentimorgan")
    file.write("%s\t" % "MarkerMb")
    file.write("%s\t" % "PearsonCorrelation")
    file.write("%s\t" % "PearsonPvalue")
    file.write("%s\t" % "SpearmanCorrelation")
    file.write("%s\t" % "SpearmanPvalue")
    file.write("%s\t" % "Number_of_BXDs_used")
    file.write("\n")
    file.flush()
    #
    for publishxref in publishxrefs:
        #
        publishxrefid = publishxref[0]
        phenotypeid = publishxref[1]
        phenotype = phenotypes.get_phenotype(phenotypeid)
        publicationid = publishxref[2]
        publication = phenotypes.get_publication(publicationid)
        publishdataid = publishxref[3]
        publishdata = phenotypes.get_publishdata(publishdataid)
        publishdata = list(zip(*publishdata))  # transpose; a bare zip() has no len() under Python 3
        if len(publishdata) != 3:
            print("publishdata - %s: %d" % (publishxrefid, len(publishdata)))
            continue
        publishdata = utilities.to_dic([strain.lower() for strain in publishdata[1]], publishdata[2])
        #
        for geno in genos:
            #
            dic1 = geno['dicvalues']
            dic2 = publishdata
            keys, values1, values2 = utilities.overlap(dic1, dic2)
            rs = calculate.correlation(values1, values2)
            #
            file.write("%s\t" % publishxrefid)
            file.write("%s;%s;%s\t" % (phenotype[0], phenotype[1], phenotype[2]))
            file.write("%s\t" % geno['locus'])
            file.write("%s\t" % geno['chr'])
            file.write("%s\t" % geno['cm'])
            file.write("%s\t" % geno['mb'])
            file.write("%s\t" % rs[0][0])
            file.write("%s\t" % rs[0][1])
            file.write("%s\t" % rs[1][0])
            file.write("%s\t" % rs[1][1])
            file.write("%s\t" % len(keys))
            file.write("\n")
            file.flush()
    #
    file.close()
Example #8
    enrichments = pd.read_csv(enrichment_fpath, sep='\t')
    print('\t[{:2d}/{:d}] Loading enrichments in: {:s}'.format(
        ei + 1, len(vp_infos), enrichment_fpath))

    # filtering enrichments
    is_sel = enrichments['bin_width'].isin(inp_args.bin_widths)
    enrichments = enrichments.loc[is_sel].reset_index(drop=True)
    enrichments = enrichments.sort_values(
        by=inp_args.enrichment_score, ascending=False).reset_index(drop=True)

    # finding overlapping calls across bin_widths/Gaussian_widths/n_steps
    enrich_crd = enrichments[['enrich_chr', 'enrich_beg', 'enrich_end']].values
    ovl_idxs = np.arange(len(enrich_crd))
    for ci in range(len(enrich_crd)):
        has_ol = overlap(enrich_crd[ci], enrich_crd,
                         offset=inp_args.neighborhood_width)
        if np.sum(has_ol) > 1:
            is_in = np.isin(ovl_idxs, ovl_idxs[has_ol])  # clc_pd.loc[has_ol]
            ovl_idxs[is_in] = np.min(ovl_idxs[is_in])  # clc_pd.loc[is_in]
    enrichments['ovl_idx'] = np.unique(ovl_idxs, return_inverse=True)[1]
    del enrich_crd, ovl_idxs

    # select significant calls: the overlap is determined, so we don't need insignificant calls anymore
    is_cis = enrichments['vp_chr'] == enrichments['enrich_chr']
    is_sig = (is_cis & (enrichments[inp_args.enrichment_score] >= inp_args.significance_threshold_cis)) | \
             (~is_cis & (enrichments[inp_args.enrichment_score] >= inp_args.significance_threshold))
    enrichments = enrichments.loc[is_sig].reset_index(drop=True)
    del is_cis, is_sig

    # combine across scales