Example #1
def learn_falseamp(data, earthmodel, false_dets):
  
  (start_time, end_time, detections, leb_events, leb_evlist,
   site_up, sites, phasenames, phasetimedef, phaseprop,
   sitenames, ttime_prefix, ddrange_file, qfvc_file,
   hydro_dir, infra_dir) = data
  
  numsites = earthmodel.NumSites()
  truecnt = [0 for s in xrange(numsites)]
  site_raw = [[] for s in xrange(numsites)]
  
  for detnum in false_dets:
    siteid = int(detections[detnum, DET_SITE_COL])
    amp = detections[detnum, DET_AMP_COL]
    if amp > 0:
      site_raw[siteid].append(np.log(amp))
      
  phaseid = 0
  
  for evnum, event in enumerate(leb_events):

    if event[EV_MEDIUM_COL] != MEDIUM_SEISMIC or event[EV_DEPTH_COL] > 10 \
           or event[EV_MB_COL] < 2:
      continue
    
    for ph, detnum in leb_evlist[evnum]:
      
      if ph==phaseid:

        siteid = int(detections[detnum, DET_SITE_COL])

        truecnt[siteid] += 1

  print "False Amp"

  all_loc, all_scale = [], []
  
  for siteid, raw in enumerate(site_raw):
    if truecnt[siteid] > 1000:
      print "siteid", siteid,

      loc, scale = cauchy.fit(raw)

      print "siteid", siteid, "Cauchy", loc, scale

      all_loc.append(loc)
      all_scale.append(scale)

  print "Gaussian of scale:", norm.fit(all_loc)
  print "InvGamma of scale:", invgamma_fit(all_scale)
Example #2
    def fit_cauchy(self, sample=131072):
        """Fit target-specific Cauchy distributions, and save to HDF5"""

        # sample SNPs
        sample_snps = random.sample(list(self.snp_indexes.values()), sample)

        # sort by chr
        chr_sample_snps = {}
        for ci, si in sample_snps:
            chr_sample_snps.setdefault(ci, []).append(si)
        for ci in chr_sample_snps:
            chr_sample_snps[ci] = sorted(chr_sample_snps[ci])

        # read SNPs
        sad = []
        for ci, csnps in chr_sample_snps.items():
            print("Reading %s" % ci, flush=True)
            sad.append(self.chr_sad5[ci].sad_matrix[csnps])
        sad = np.concatenate(sad).astype("float32")

        # initialize fit parameters
        self.target_cauchy_fit_loc = np.zeros(self.num_targets)
        self.target_cauchy_fit_scale = np.zeros(self.num_targets)

        # fit parameters
        for ti in range(self.num_targets):
            print(" Fitting t%d" % ti, flush=True)
            cp = cauchy.fit(sad[:, ti])
            self.target_cauchy_fit_loc[ti] = cp[0]
            self.target_cauchy_fit_scale[ti] = cp[1]

        # write to HDF5
        for chrm, sad5 in self.chr_sad5.items():
            sad5.sad_h5_open.close()
            sad5.sad_h5_open = h5py.File(sad5.sad_h5_file, "r+")
            if "target_cauchy_fit_loc" in sad5.sad_h5_open:
                del sad5.sad_h5_open["target_cauchy_fit_loc"]
                del sad5.sad_h5_open["target_cauchy_fit_scale"]
            sad5.sad_h5_open.create_dataset("target_cauchy_fit_loc",
                                            data=self.target_cauchy_fit_loc)
            sad5.sad_h5_open.create_dataset("target_cauchy_fit_scale",
                                            data=self.target_cauchy_fit_scale)
            sad5.sad_h5_open.close()
            sad5.sad_h5_open = h5py.File(sad5.sad_h5_file, "r")
Example #3
                        cv=5,
                        verbose=1,
                        n_jobs=-2)
    grid.fit(dist_filt[:, None])

    ### KDE representation
    kde = KernelDensity(bandwidth=grid.best_params_['bandwidth'], 
                        kernel='gaussian')
    kde.fit(dist_filt[:, None])

    logprob_kde = kde.score_samples(x_d[:, None])
    pdfkde = np.exp(logprob_kde)
    
        
    ### Fit a Cauchy distribution 
    loc, scale = cauchy.fit(dist_filt)
    ncauchy = cauchy.pdf(x_d, loc=loc, scale=scale)
    
    ### Print info and plot
    print(idx,dmin,dmax,np.abs(np.mean(dist)),grid.best_params_['bandwidth'],data['metric'][idx])
    p = ax.plot(x_d,pdfkde)
    axins.plot(wl_vec,f_eps(wl_vec,1))
    
    if plot_cauchy:
        ax.plot(x_d,ncauchy,linestyle='dashed',color=p[-1].get_color())
    
    idxM = np.argmax(pdfkde)
    ax.text(x_d[idxM],pdfkde[idxM],data['metric'][idx])
    
    ### Maximum of all of the PDFs
    maxpdf = max(maxpdf,np.max(pdfkde))
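Without the plotting, the KDE-versus-Cauchy comparison in this example reduces to the sketch below; the bandwidth is fixed rather than grid-searched, and the data are synthetic:

import numpy as np
from scipy.stats import cauchy
from sklearn.neighbors import KernelDensity

rng = np.random.default_rng(0)
dist_filt = rng.standard_cauchy(2000)
dist_filt = dist_filt[np.abs(dist_filt) < 10]  # trim extreme tails

x_d = np.linspace(-10, 10, 1000)

# Nonparametric density estimate.
kde = KernelDensity(bandwidth=0.5, kernel='gaussian')
kde.fit(dist_filt[:, None])
pdfkde = np.exp(kde.score_samples(x_d[:, None]))

# Parametric Cauchy fit of the same sample.
loc, scale = cauchy.fit(dist_filt)
ncauchy = cauchy.pdf(x_d, loc=loc, scale=scale)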
Example #4
from scipy.stats import cauchy, norm    # noqa

residuals = gandalfs.zenith - primaries.zenith
cut = (gandalfs['lambda'] < l) & (np.abs(residuals) < 2 * np.pi)
residuals = residuals[cut]
info = info[cut]  # keep rows matching the cut

# convert rad -> deg
residuals = residuals * 180 / np.pi

pi = 180
# x axis for plotting
x = np.linspace(-pi, pi, 1000)

c_loc, c_gamma = cauchy.fit(residuals)
fwhm = 2 * c_gamma

g_mu_bad, g_sigma_bad = norm.fit(residuals)
g_mu, g_sigma = norm.fit(residuals[np.abs(residuals) < 10])

plt.hist(residuals, bins='auto', label='Histogram', density=True, alpha=.7)
plt.plot(
    x,
    cauchy(c_loc, c_gamma).pdf(x),
    label='Lorentz: FWHM $=${:.3f}'.format(fwhm),
    linewidth=2
)
plt.plot(
    x,
    norm(g_mu_bad, g_sigma_bad).pdf(x),
Example #5
sigarr_div = np.abs(sigarr_div)
ijarr = np.array(ijarr)

### Sample the nwl * (nwl-1)/2 normal distributions
Zarr = np.zeros(ncomb)
for m in range(ncomb):
    sigrand = sigarr[m]
#    sigrand = 10
#    sigrand = sigarr_expon[m]
#    sigrand = sigarr_div[m]
#    sigrand = np.random.choice(sigarr_expon)
    Zarr[m] = norm.rvs(loc=0, scale=sigrand)

### Fit a Cauchy distribution
loc,sca = cauchy.fit(Zarr)
locnorm, scanorm = norm.fit(Zarr)
dft, loct, scat = t.fit(Zarr)

### Compound distribution
#sigarr[:] = sigrand
#weights = 1/sigarr_expon
#weights = weights / np.sum(weights)
weights = np.ones_like(sigarr)
pdf_cmb = lambda x: np.sum(weights * 1/sigarr * 1/np.sqrt(2*np.pi) * np.exp(-1/2*x**2/sigarr**2))
#pdf_cmb  = lambda x: np.sum(weights * 1/sigarr_expon * 1/np.sqrt(2*np.pi) * np.exp(-1/2*x**2/sigarr_expon**2))
#pdf_cmb  = lambda x: np.sum(weights * 1/sigarr_div * 1/np.sqrt(2*np.pi) * np.exp(-1/2*x**2/sigarr_div**2))

### Buhlmann
#v2 = np.var(sigarr)
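The point of this example, that a normal whose scale is itself random produces heavy tails, fits in a few lines; the exponential scale distribution below is one arbitrary choice:

import numpy as np
from scipy.stats import cauchy, norm, t

rng = np.random.default_rng(0)

# Scale mixture of normals: each draw gets its own random sigma.
sigarr = rng.exponential(scale=1.0, size=5000)
Zarr = rng.normal(loc=0.0, scale=sigarr)

# The mixture is heavier-tailed than any single normal, so the
# Cauchy and Student-t fits describe it better than norm.fit does.
print('Cauchy :', cauchy.fit(Zarr))   # (loc, scale)
print('Normal :', norm.fit(Zarr))     # (mean, std)
print('Student:', t.fit(Zarr))        # (df, loc, scale)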
Example #6
def main():
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='sad_norm')
    parser.add_option(
        '-s',
        dest='sample',
        default=100000,
        type='int',
        help='Number of SNPs to sample for fit [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide SAD HDF5 path')
    else:
        sad_h5_path = args[0]

    # retrieve chromosome SAD HDF5 files
    chr_sad_h5_files = sorted(glob.glob('%s/*/sad.h5' % sad_h5_path))
    assert (len(chr_sad_h5_files) > 0)

    # clean out any existing fits
    # count SNPs across chromosomes
    num_snps = 0
    for chr_sad_h5_file in chr_sad_h5_files:
        chr_sad_h5 = h5py.File(chr_sad_h5_file, 'r+')

        # delete fit params
        if 'target_cauchy_fit_loc' in chr_sad_h5.keys():
            del chr_sad_h5['target_cauchy_fit_loc']
            del chr_sad_h5['target_cauchy_fit_scale']

        # delete norm params
        if 'target_cauchy_norm_loc' in chr_sad_h5.keys():
            del chr_sad_h5['target_cauchy_norm_loc']
            del chr_sad_h5['target_cauchy_norm_scale']

        # count SNPs
        num_snps += chr_sad_h5['SAD'].shape[0]
        num_targets = chr_sad_h5['SAD'].shape[-1]

        chr_sad_h5.close()

    # sample SNPs across chromosomes
    sad = sample_sad(chr_sad_h5_files, options.sample, num_snps, num_targets)

    # initialize fit parameters
    target_cauchy_fit_loc = np.zeros(num_targets)
    target_cauchy_fit_scale = np.zeros(num_targets)

    # fit parameters
    for ti in range(num_targets):
        print('Fitting t%d' % ti, flush=True)
        cp = cauchy.fit(sad[:, ti])
        target_cauchy_fit_loc[ti] = cp[0]
        target_cauchy_fit_scale[ti] = cp[1]
    del sad

    # write across chromosomes
    for chr_sad_h5_file in chr_sad_h5_files:
        chr_sad_h5 = h5py.File(chr_sad_h5_file, 'r+')
        chr_sad_h5.create_dataset('target_cauchy_fit_loc',
                                  data=target_cauchy_fit_loc)
        chr_sad_h5.create_dataset('target_cauchy_fit_scale',
                                  data=target_cauchy_fit_scale)
        chr_sad_h5.close()

    # compute normalization parameters
    for chr_sad_h5_file in chr_sad_h5_files:
        chr_sad5 = SAD5(chr_sad_h5_file)

    # QC fit table
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)
    fit_out = open('%s/fits.txt' % options.out_dir, 'w')
    for ti in range(num_targets):
        print('%-4d  %7.1e  %7.1e' %
              (ti, target_cauchy_fit_loc[ti], target_cauchy_fit_scale[ti]),
              file=fit_out)
    fit_out.close()

    # QC quantiles
    quantile_dir = '%s/quantiles' % options.out_dir
    if not os.path.isdir(quantile_dir):
        os.mkdir(quantile_dir)
    sad_qc = sample_sad(chr_sad_h5_files, 2048, num_snps, num_targets)
    for ti in np.linspace(0, num_targets - 1, 64, dtype='int'):
        # compute cauchy and argsort quantiles
        cauchy_q = cauchy.cdf(sad_qc[:, ti],
                              loc=target_cauchy_fit_loc[ti],
                              scale=target_cauchy_fit_scale[ti])
        sort_i = np.argsort(sad_qc[:, ti])

        quantile_pdf = '%s/t%d.pdf' % (quantile_dir, ti)

        jointplot(np.linspace(0, 1, len(sort_i)),
                  cauchy_q[sort_i],
                  quantile_pdf,
                  square=True,
                  cor=None,
                  x_label='Empirical',
                  y_label='Cauchy')

    # QC plots
    norm_dir = '%s/norm' % options.out_dir
    if not os.path.isdir(norm_dir):
        os.mkdir(norm_dir)
    chr_sad5 = SAD5(chr_sad_h5_files[0])
    qc_sample = 2048
    if qc_sample < chr_sad5.num_snps:
        ri = sorted(
            np.random.choice(np.arange(chr_sad5.num_snps),
                             size=qc_sample,
                             replace=False))
    else:
        ri = np.arange(chr_sad5.num_snps)
    qc_sad_raw = chr_sad5.sad_matrix[ri]
    qc_sad_norm = chr_sad5[ri]
    for ti in np.linspace(0, num_targets - 1, 32, dtype='int'):
        plt.figure()
        sns.jointplot(qc_sad_raw[:, ti],
                      qc_sad_norm[:, ti],
                      joint_kws={
                          'alpha': 0.5,
                          's': 10
                      })
        plt.savefig('%s/t%d.pdf' % (norm_dir, ti))
        plt.close()
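The quantile QC in this pipeline is a P-P check: if the Cauchy fit is good, the fitted CDF evaluated at the sorted sample tracks a uniform grid. A sketch of the same check on synthetic data:

import numpy as np
from scipy.stats import cauchy

sample = cauchy.rvs(loc=0.0, scale=0.01, size=2048, random_state=1)
loc, scale = cauchy.fit(sample)

sort_i = np.argsort(sample)
cauchy_q = cauchy.cdf(sample, loc=loc, scale=scale)
empirical_q = np.linspace(0, 1, len(sort_i))

# Near-zero deviation means the jointplot above hugs the diagonal.
print('max P-P deviation:', np.max(np.abs(cauchy_q[sort_i] - empirical_q)))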
Example #7
def plot_histogram(filename, 
    column_names=[], skip_cols=[], nbins=10, trimends=False,
    autosave=False, save_directory='', save_format='svg', delimiter=None):
    """
    Plots a histogram formed from the columns of the specified file.

    If column_names is specified, the titles of the plots will be renamed
    accordingly.  Otherwise "Title" is inserted instead.

    skip_cols specifies any columns in the data that should be skipped.
    Columns at the end of the line may be skipped by using negative numbers.
    In this scheme the last column in a row is -1.
    """
    infile = open(filename, 'r')
    if delimiter:
        data = loadtxt(infile, dtype=float, delimiter=delimiter)
    else:
        data = loadtxt(infile, dtype=float)       
    infile.close()

    end_col = data.shape[1]
    
    norm_stats = list()
    cauchy_stats = list()

    # Reinterpret any negative numbers in skip_cols to be at the end of the line
    for column in range(0, len(skip_cols)):
        if skip_cols[column] < 0:
            skip_cols[column] = end_col + skip_cols[column]
       
    namecol = 0 
    for column in range(0, end_col):
        # Skip the column if instructed to do so:
        if(column in skip_cols):
            continue

        # extract the data column:
        temp = data[:,column]
        
        if(trimends):
            minval = min(temp)
            maxval = max(temp)
            
            temp = filter(lambda x: x > minval, temp)
            temp = filter(lambda x: x < maxval, temp)
        
        # plot a histogram of the data:
        [n, bins, patches] = plt.hist(temp, bins=nbins, normed=True, label='Binned data')
        
        # fit a normal distribution:
        [norm_mu, norm_sigma] = norm.fit(temp)
        y = mlab.normpdf(bins, norm_mu, norm_sigma)
        legend_gauss = r'Normal: $\mu=%.3f,\ \sigma=%.3f$' % (norm_mu, norm_sigma)
        l = plt.plot(bins, y, 'r--', linewidth=2, label=legend_gauss)
        
        # fit a Lorentz/Cauchy distribution:
        # bug workaround for http://projects.scipy.org/scipy/ticket/1530
        # - specify a starting centroid value for the fit
        [cauchy_mu, cauchy_gamma] = cauchy.fit(temp, loc=norm_mu)
        y = cauchy.pdf(bins, loc=cauchy_mu, scale=cauchy_gamma)
        legend_cauchy = r'Cauchy: $\mu=%.3f,\ \gamma=%.3f$' % (cauchy_mu, cauchy_gamma)
        l = plt.plot(bins, y, 'g--', linewidth=2, label=legend_cauchy)
        
        # now setup the axes labels:
        try:
            title = column_names[namecol]
            namecol += 1
        except IndexError:
            title = "Title"
        
        plt.title(title)
        plt.xlabel("Value")
        plt.ylabel("Frequency")
        plt.legend(loc='best')
        
        if autosave:
            plt.savefig(save_directory + '/stats_hist_' + title + '.' + save_format, transparent=True, format=save_format)    
            plt.close()
        else:
            plt.show()
            
        # Add in the statistical information.
        norm_stats.append([title, norm_mu, norm_sigma])
        cauchy_stats.append([title, cauchy_mu, cauchy_gamma])


    # Now either print out or save the statistical information
    if(not autosave):
        print "Normal Statistics:"
        
    write_statistics(save_directory + '/stats_normal.txt', norm_stats, autosave)
    
    if(not autosave):
        print "Cauchy Statistics:"
        
    write_statistics(save_directory + '/stats_cauchy.txt', cauchy_stats, autosave)
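The workaround above relies on a general feature of scipy's fit: loc= and scale= passed to fit act as starting values for the optimizer, while floc=/fscale= would pin a parameter instead of estimating it. A short sketch of seeding the Cauchy fit with a normal-fit centroid:

from scipy.stats import cauchy, norm

data = cauchy.rvs(loc=5.0, scale=0.2, size=1000, random_state=0)

# Rough centroid from a normal fit seeds the Cauchy optimizer.
rough_mu, _ = norm.fit(data)
loc, scale = cauchy.fit(data, loc=rough_mu)
print(loc, scale)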
Example #8
    axs[0].hist(strided['dspeed_rel'],
                bins=np.linspace(-0.5, 0.5, 100),
                density=True)
    iqrs = np.percentile(strided['dspeed_rel'], [25, 75])
    axs[0].axvline(iqrs[0], color='k')
    axs[0].axvline(iqrs[1], color='k')

    axs[1].hist(strided['dhdg_rel'],
                bins=np.linspace(-3.0, 3.0, 100),
                density=True)
    iqrs = np.percentile(strided['dhdg_rel'], [25, 75])
    axs[1].axvline(iqrs[0], color='k')
    axs[1].axvline(iqrs[1], color='k')

    axs[0].set_title(f'Stride: {stride}')

    res = cauchy.fit(strided['dspeed_rel'])
    loc, scale = res
    samp = np.linspace(-0.5, 0.5, 100)
    pdf = cauchy.pdf(samp, *res)
    axs[0].plot(samp, pdf, color='tab:orange')
    axs[0].text(0.05,
                0.85,
                f"Cauchy(loc={loc:.3f}, scale={scale:.3f})",
                transform=axs[0].transAxes)

    res = cauchy.fit(strided['dhdg_rel'])
    loc, scale = res
    samp = np.linspace(-np.pi, np.pi, 100)
    pdf = cauchy.pdf(samp, *res)
    axs[1].plot(samp, pdf, color='tab:orange')
    axs[1].text(0.05,
                0.85,
                f"Cauchy(loc={loc:.3f}, scale={scale:.3f})",
                transform=axs[1].transAxes)
Example #9
def fit_cauchy(sad, ti):
    print('Fitting t%d' % ti)
    return cauchy.fit(sad[:, ti])
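A helper like this is typically handed to a process pool so the per-target fits run in parallel; a sketch of that usage, with a synthetic matrix standing in for sad:

import multiprocessing
from functools import partial

import numpy as np
from scipy.stats import cauchy

def fit_cauchy(sad, ti):
    return cauchy.fit(sad[:, ti])

if __name__ == '__main__':
    sad = np.random.default_rng(0).standard_cauchy((4096, 8))
    with multiprocessing.Pool() as pool:
        params = pool.map(partial(fit_cauchy, sad), range(sad.shape[1]))
    # params[ti] is the (loc, scale) pair for target ti.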
Example #10
                wl_vec = np.linspace(lambda_0, lambda_N, 3000)

                pix_vec = np.linspace(0, 2999, 3000)
                pix_vec = np.array(pix_vec, dtype=np.int64)

                ncomb = int(nwl * (nwl - 1) / 2)

                _, dToT = generate_Taverage_distribution(
                    T0, wl_vec, pix_vec, nwl)
                muTbar, sigTbar, ratio, muThat, sigThat = compute_high_order_variance(
                    T0, sigma_I, w)
                #                _ = plt.hist(dToT[(dToT<0.1)&(dToT>-0.1)],bins=100,normed=True,histtype='step')
                #                dToT = dToT[(dToT<0.1)&(dToT>-0.1)]

                loc, sca = cauchy.fit(dToT)
                x_d = np.linspace(-0.1, 0.1, 1000)
                plt.plot(x_d, cauchy.pdf(x_d, loc=loc, scale=sca), 'k-')

                mu = np.average(dToT)
                sig = np.std(dToT)
                skw = skew(dToT)
                krt = kurtosis(dToT, fisher=False)

                res.append([mu, sig, skw, krt])

res = np.array(res)
#plt.xlim([-0.1,0.1])

#avec = np.arange(20,200,20)
#for alpha in avec:
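One caution about the moment summaries collected in res: a Cauchy-like distribution has no finite mean or variance, so the sample mean, std, skew, and kurtosis never stabilize as the sample grows, whereas the loc/scale returned by cauchy.fit do. A quick demonstration:

import numpy as np
from scipy.stats import cauchy

for n in (10**3, 10**5):
    x = cauchy.rvs(size=n, random_state=42)
    loc, scale = cauchy.fit(x)
    # mean/std wander with n; loc/scale stay near (0, 1).
    print(n, np.mean(x), np.std(x), loc, scale)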
Example #11
from scipy.stats import cauchy, norm  # noqa

residuals = gandalfs.zenith - primaries.zenith
cut = (gandalfs["lambda"] < l) & (np.abs(residuals) < 2 * np.pi)
residuals = residuals[cut]
event_info = event_info[cut]  # keep rows matching the cut

# convert rad -> deg
residuals = residuals * 180 / np.pi

pi = 180
# x axis for plotting
x = np.linspace(-pi, pi, 1000)

c_loc, c_gamma = cauchy.fit(residuals)
fwhm = 2 * c_gamma

g_mu_bad, g_sigma_bad = norm.fit(residuals)
g_mu, g_sigma = norm.fit(residuals[np.abs(residuals) < 10])

plt.hist(residuals, bins="auto", label="Histogram", density=True, alpha=0.7)
plt.plot(
    x,
    cauchy(c_loc, c_gamma).pdf(x),
    label="Lorentz: FWHM $=${:.3f}".format(fwhm),
    linewidth=2,
)
plt.plot(
    x,
    norm(g_mu_bad, g_sigma_bad).pdf(x),
Example #12
def main():
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("-o", dest="out_dir", default="sad_norm")
    parser.add_option(
        "-s",
        dest="sample",
        default=100000,
        type="int",
        help="Number of SNPs to sample for fit [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide SAD HDF5 path")
    else:
        sad_h5_path = args[0]

    # retrieve chromosome SAD HDF5 files
    chr_sad_h5_files = sorted(glob.glob("%s/*/sad.h5" % sad_h5_path))
    assert len(chr_sad_h5_files) > 0

    # clean out any existing fits
    # count SNPs across chromosomes
    num_snps = 0
    for chr_sad_h5_file in chr_sad_h5_files:
        chr_sad_h5 = h5py.File(chr_sad_h5_file, "r+")

        # delete fit params
        if "target_cauchy_fit_loc" in chr_sad_h5.keys():
            del chr_sad_h5["target_cauchy_fit_loc"]
            del chr_sad_h5["target_cauchy_fit_scale"]

        # delete norm params
        if "target_cauchy_norm_loc" in chr_sad_h5.keys():
            del chr_sad_h5["target_cauchy_norm_loc"]
            del chr_sad_h5["target_cauchy_norm_scale"]

        # count SNPs
        num_snps += chr_sad_h5["SAD"].shape[0]
        num_targets = chr_sad_h5["SAD"].shape[-1]

        chr_sad_h5.close()

    # sample SNPs across chromosomes
    sad = sample_sad(chr_sad_h5_files, options.sample, num_snps, num_targets)

    # initialize fit parameters
    target_cauchy_fit_loc = np.zeros(num_targets)
    target_cauchy_fit_scale = np.zeros(num_targets)

    # fit parameters
    for ti in range(num_targets):
        print("Fitting t%d" % ti, flush=True)
        cp = cauchy.fit(sad[:, ti])
        target_cauchy_fit_loc[ti] = cp[0]
        target_cauchy_fit_scale[ti] = cp[1]
    del sad

    # write across chromosomes
    for chr_sad_h5_file in chr_sad_h5_files:
        chr_sad_h5 = h5py.File(chr_sad_h5_file, "r+")
        chr_sad_h5.create_dataset("target_cauchy_fit_loc", data=target_cauchy_fit_loc)
        chr_sad_h5.create_dataset(
            "target_cauchy_fit_scale", data=target_cauchy_fit_scale
        )
        chr_sad_h5.close()

    # compute normalization parameters
    for chr_sad_h5_file in chr_sad_h5_files:
        chr_sad5 = SAD5(chr_sad_h5_file)

    # QC fit table
    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)
    fit_out = open("%s/fits.txt" % options.out_dir, "w")
    for ti in range(num_targets):
        print(
            "%-4d  %7.1e  %7.1e"
            % (ti, target_cauchy_fit_loc[ti], target_cauchy_fit_scale[ti]),
            file=fit_out,
        )
    fit_out.close()

    # QC quantiles
    quantile_dir = "%s/quantiles" % options.out_dir
    if not os.path.isdir(quantile_dir):
        os.mkdir(quantile_dir)
    sad_qc = sample_sad(chr_sad_h5_files, 2048, num_snps, num_targets)
    for ti in np.linspace(0, num_targets - 1, 64, dtype="int"):
        # compute cauchy and argsort quantiles
        cauchy_q = cauchy.cdf(
            sad_qc[:, ti],
            loc=target_cauchy_fit_loc[ti],
            scale=target_cauchy_fit_scale[ti],
        )
        sort_i = np.argsort(sad_qc[:, ti])

        quantile_pdf = "%s/t%d.pdf" % (quantile_dir, ti)

        jointplot(
            np.linspace(0, 1, len(sort_i)),
            cauchy_q[sort_i],
            quantile_pdf,
            square=True,
            cor=None,
            x_label="Empirical",
            y_label="Cauchy",
        )

    # QC plots
    norm_dir = "%s/norm" % options.out_dir
    if not os.path.isdir(norm_dir):
        os.mkdir(norm_dir)
    chr_sad5 = SAD5(chr_sad_h5_files[0])
    qc_sample = 2048
    if qc_sample < chr_sad5.num_snps:
        ri = sorted(
            np.random.choice(
                np.arange(chr_sad5.num_snps), size=qc_sample, replace=False
            )
        )
    else:
        ri = np.arange(chr_sad5.num_snps)
    qc_sad_raw = chr_sad5.sad_matrix[ri]
    qc_sad_norm = chr_sad5[ri]
    for ti in np.linspace(0, num_targets - 1, 32, dtype="int"):
        plt.figure()
        sns.jointplot(
            qc_sad_raw[:, ti], qc_sad_norm[:, ti], joint_kws={"alpha": 0.5, "s": 10}
        )
        plt.savefig("%s/t%d.pdf" % (norm_dir, ti))
        plt.close()
Example #13
def plot_histogram(filename,
                   column_names=[],
                   skip_cols=[],
                   nbins=10,
                   trimends=False,
                   autosave=False,
                   save_directory='',
                   save_format='svg',
                   delimiter=None):
    """
    Plots a histogram formed from the columns of the specified file.

    If column_names is specified, the titles of the plots will be renamed
    accordingly.  Otherwise "Title" is inserted instead.

    skip_cols specifies any columns in the data that should be skipped.
    Columns at the end of the line may be skipped by using negative numbers.
    In this scheme the last column in a row is -1.
    """
    infile = open(filename, 'r')
    if delimiter:
        data = loadtxt(infile, dtype=float, delimiter=delimiter)
    else:
        data = loadtxt(infile, dtype=float)
    infile.close()

    end_col = data.shape[1]

    norm_stats = list()
    cauchy_stats = list()

    # Reinterpret any negative numbers in skip_cols to be at the end of the line
    for column in range(0, len(skip_cols)):
        if skip_cols[column] < 0:
            skip_cols[column] = end_col + skip_cols[column]

    namecol = 0
    for column in range(0, end_col):
        # Skip the column if instructed to do so:
        if (column in skip_cols):
            continue

        # extract the data column:
        temp = data[:, column]

        if (trimends):
            minval = min(temp)
            maxval = max(temp)

            temp = filter(lambda x: x > minval, temp)
            temp = filter(lambda x: x < maxval, temp)

        # plot a histogram of the data:
        [n, bins, patches] = plt.hist(temp,
                                      bins=nbins,
                                      normed=True,
                                      label='Binned data')

        # fit a normal distribution:
        [norm_mu, norm_sigma] = norm.fit(temp)
        y = mlab.normpdf(bins, norm_mu, norm_sigma)
        legend_gauss = r'Normal: $\mu=%.3f,\ \sigma=%.3f$' % (norm_mu,
                                                              norm_sigma)
        l = plt.plot(bins, y, 'r--', linewidth=2, label=legend_gauss)

        # fit a Lorentz/Cauchy distribution:
        # bug workaround for http://projects.scipy.org/scipy/ticket/1530
        # - specify a starting centroid value for the fit
        [cauchy_mu, cauchy_gamma] = cauchy.fit(temp, loc=norm_mu)
        y = cauchy.pdf(bins, loc=cauchy_mu, scale=cauchy_gamma)
        legend_cauchy = r'Cauchy: $\mu=%.3f,\ \gamma=%.3f$' % (cauchy_mu,
                                                               cauchy_gamma)
        l = plt.plot(bins, y, 'g--', linewidth=2, label=legend_cauchy)

        # now setup the axes labels:
        try:
            title = column_names[namecol]
            namecol += 1
        except IndexError:
            title = "Title"

        plt.title(title)
        plt.xlabel("Value")
        plt.ylabel("Frequency")
        plt.legend(loc='best')

        if autosave:
            plt.savefig(save_directory + '/stats_hist_' + title + '.' +
                        save_format,
                        transparent=True,
                        format=save_format)
            plt.close()
        else:
            plt.show()

        # Add in the statistical information.
        norm_stats.append([title, norm_mu, norm_sigma])
        cauchy_stats.append([title, cauchy_mu, cauchy_gamma])

    # Now either print out or save the statistical information
    if (not autosave):
        print "Normal Statistics:"

    write_statistics(save_directory + '/stats_normal.txt', norm_stats,
                     autosave)

    if (not autosave):
        print "Cauchy Statistics:"

    write_statistics(save_directory + '/stats_cauchy.txt', cauchy_stats,
                     autosave)
Example #14
def assign_sex(sextable, Rx_init, soft='Cauchy'):
    """assign sex to samples using k-means clustering"""
    Rx = list(map(operator.itemgetter(6), sextable))
    centroid, hard_classification = kmeans2(Rx,
                                            np.array([0.5 * Rx_init, Rx_init]),
                                            minit='matrix')

    Rx_m = [Rx[i] for i, j in enumerate(hard_classification) if j == 0]
    Rx_f = [Rx[i] for i, j in enumerate(hard_classification) if j == 1]

    m_mu, m_std = centroid[0], np.std(Rx_m)
    f_mu, f_std = centroid[1], np.std(Rx_f)

    if soft == 'Normal':
        # Rx~Normal
        m_dist = norm(m_mu, m_std)
        f_dist = norm(f_mu, f_std)

    elif soft == 'Beta':
        # Rx~Beta - nasty estimation and looks like the normal anyway
        mMx = [Mx[i] for i, j in enumerate(hard_classification) if j == 0]
        mMa = [Ma[i] for i, j in enumerate(hard_classification) if j == 0]
        fMx = [Mx[i] for i, j in enumerate(hard_classification) if j == 1]
        fMa = [Ma[i] for i, j in enumerate(hard_classification) if j == 1]
        mloc, mscale = beta.fit_loc_scale(Rx_m, np.median(mMx), np.median(mMa))
        ma, mb, mloc, mscale = beta.fit(Rx_m, floc=mloc, fscale=mscale)
        #print(ma, mb, mloc, mscale)
        floc, fscale = beta.fit_loc_scale(Rx_f, np.median(fMx), np.median(fMa))
        fa, fb, floc, fscale = beta.fit(Rx_f, floc=floc, fscale=fscale)
        #print(fa, fb, floc, fscale)
        m_dist = beta(ma, mb, loc=mloc, scale=mscale)
        f_dist = beta(fa, fb, loc=floc, scale=fscale)

    elif soft == 'Cauchy':
        # Rx~Cauchy - assumption of independence for num/denom is violated, but otherwise seems sensible
        m_loc, m_scale = cauchy.fit(Rx_m)
        f_loc, f_scale = cauchy.fit(Rx_f)
        m_dist = cauchy(m_loc, m_scale)
        f_dist = cauchy(f_loc, f_scale)
        # use Cauchy central tendency
        m_mu = m_loc
        f_mu = f_loc

    m_bound = m_dist.ppf(0.95)
    f_bound = f_dist.ppf(0.05)
    m_int = m_dist.interval(0.99)
    f_int = f_dist.interval(0.99)
    m_int = (max(0.0, m_int[0]), min(1.0, m_int[1]))
    f_int = (max(0.0, f_int[0]), min(1.0, f_int[1]))

    soft_classification = []
    for x in Rx:
        if x < m_bound:
            soft_classification.append(0)
        elif x > f_bound:
            soft_classification.append(1)
        else:
            soft_classification.append(None)

    if True:
        zn = 100.0
        logit_pm = np.empty_like(Rx)
        cntrd = centroid
        for i, _ in enumerate(Rx):
            Rx_i = Rx[:i] + Rx[i + 1:]
            #cntrd, _ = kmeans2(Rx_i, np.array([0.5*Rx_init, Rx_init]), minit='matrix')
            x, a = sextable[i][1], sextable[i][3]
            p_m = 0
            p_f = 0
            for z in np.linspace(m_int[0], m_int[1], zn):
                p_m += binom.pmf(x, x + a, z) * (m_dist.cdf(z + 0.5 / zn) -
                                                 m_dist.cdf(z - 0.5 / zn))
            for z in np.linspace(f_int[0], f_int[1], zn):
                p_f += binom.pmf(x, x + a, z) * (f_dist.cdf(z + 0.5 / zn) -
                                                 f_dist.cdf(z - 0.5 / zn))
            #p_m /= zn
            #p_f /= zn
            if p_m == 0.0:
                logit_pm[i] = -math.log(p_f)
            elif p_f == 0.0:
                logit_pm[i] = math.log(p_m)
            else:
                logit_pm[i] = math.log(p_m) - math.log(p_f)
            #print(p_m, p_f, logit_pm[i])

            #p_m = binom.logsf(x-1, x+a, cntrd[0])
            #p_f = binom.logcdf(x-1, x+a, cntrd[1])
            #logit_pm[i] = p_m - p_f

    return hard_classification, soft_classification, m_dist, f_dist, m_mu, f_mu, m_bound, f_bound, logit_pm
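The soft-classification idea here reduces to fitting one Cauchy per cluster and thresholding on their quantiles. A minimal sketch with two hypothetical clusters of chrX rates:

import numpy as np
from scipy.stats import cauchy

# Hypothetical clusters; loc/scale values are illustrative only.
rx_m = cauchy.rvs(loc=0.25, scale=0.01, size=200, random_state=1)
rx_f = cauchy.rvs(loc=0.50, scale=0.01, size=200, random_state=2)

m_dist = cauchy(*cauchy.fit(rx_m))
f_dist = cauchy(*cauchy.fit(rx_f))

# Below the male 95th percentile -> 0, above the female 5th -> 1,
# anything between the two bounds stays unassigned (None).
m_bound, f_bound = m_dist.ppf(0.95), f_dist.ppf(0.05)
calls = [0 if x < m_bound else 1 if x > f_bound else None
         for x in np.concatenate([rx_m, rx_f])]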