def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000, alpha=1, output_filename=None):
    """Select simulated reads with an accept/reject loop and plot the result.

    Each read length of ``self.bam_simul`` is taken in turn as a candidate.
    It is accepted with probability
    ``min(alpha, target(candidate) / target(current))`` where ``target`` is
    :meth:`target_distribution` and ``current`` is the last accepted length
    (initialised to the mean read length of ``self.bam``). The accept/reject
    decisions are stored, one boolean per simulated read, in
    ``self.tokeep``.

    A two-panel figure titled 'Metropolis-Hastings' is then drawn: the trace
    of accepted values, and their normalised histogram overlaid with the
    target curve.

    :param bins: number of bins for the histogram of the input read lengths
        (stored in ``self.X``/``self.Y``) and for the plotted histogram.
    :param xmin: start of the grid on which the target curve is drawn.
    :param xmax: end (excluded) of that grid.
    :param step: spacing of that grid.
    :param burn: number of leading accepted values left out of the plotted
        histogram.
    :param alpha: upper bound applied to the acceptance probability.
    :param output_filename: if not None, it is passed together with
        ``self.tokeep`` to ``self.bam_simul.filter_bool`` (presumably to
        write the selected reads -- confirm against that method).
    """
    # Compute the histogram of the input reads once for all; it is meant to
    # be used in the target_distribution method.
    # ``density=True`` replaces the ``normed`` keyword, which recent NumPy
    # releases no longer accept.
    self.bins = bins
    self.Y, self.X = np.histogram(self.bam.df.read_length, bins=bins, density=True)

    lengths = self.bam_simul.df.read_length.values
    self.tokeep = []
    vec = []
    x = self.bam.df.read_length.mean()
    for can in lengths:
        # Acceptance probability.
        # NOTE(review): no guard here against target_distribution(x) == 0.
        aprob = min([alpha, self.target_distribution(can) / self.target_distribution(x)])
        u = pylab.uniform(0, 1)
        if u < aprob:
            x = can
            vec.append(x)
            self.tokeep.append(True)
        else:
            self.tokeep.append(False)

    # Plotting the results; the theoretical curve is evaluated on a regular grid.
    x = pylab.arange(xmin, xmax, step)
    y = self.target_distribution(x)
    pylab.subplot(211)
    pylab.title('Metropolis-Hastings')
    pylab.plot(vec)
    pylab.subplot(212)

    # ``density`` replaces the removed ``normed`` keyword of matplotlib's hist.
    pylab.hist(vec[burn:], bins=bins, density=1)
    pylab.plot(x, y, 'r-')
    pylab.ylabel('Frequency')
    pylab.xlabel('x')
    pylab.legend(('PDF', 'Samples'))

    if output_filename is not None:
        self.bam_simul.filter_bool(output_filename, self.tokeep)
def run(self, bins=50, xmin=0, xmax=30000, step=1000, burn=1000,alpha=1,output_filename=None):
    """Run the accept/reject selection of simulated reads and plot it.

    The read lengths of ``self.bam_simul`` are visited in order. A candidate
    replaces the current value when a uniform draw in [0, 1) falls below
    ``min(alpha, target(candidate) / target(current))``, ``target`` being
    :meth:`target_distribution`. The chain starts from the mean read length
    of ``self.bam``. ``self.tokeep`` receives one boolean per simulated
    read (True when accepted).

    :param bins: bin count for the input histogram kept in
        ``self.Y``/``self.X`` and for the histogram drawn in the lower panel.
    :param xmin: first abscissa of the target curve.
    :param xmax: upper limit (excluded) of the target curve abscissae.
    :param step: spacing between abscissae of the target curve.
    :param burn: how many accepted values are skipped at the start when
        drawing the histogram.
    :param alpha: cap on the acceptance probability.
    :param output_filename: when not None, forwarded with ``self.tokeep``
        to ``self.bam_simul.filter_bool``.
    """
    # Histogram of the input reads, computed a single time and kept on the
    # instance for the target_distribution method.
    self.bins = bins
    input_lengths = self.bam.df.read_length
    self.Y, self.X = np.histogram(input_lengths, bins=bins, density=True)

    self.tokeep = []
    keep = self.tokeep  # same list object, shorter name
    chain = []
    current = self.bam.df.read_length.mean()

    for candidate in self.bam_simul.df.read_length.values:
        ratio = self.target_distribution(candidate) / self.target_distribution(current)
        acceptance = min(alpha, ratio)
        draw = pylab.uniform(0, 1)
        if draw < acceptance:
            current = candidate
            chain.append(current)
            keep.append(True)
        else:
            keep.append(False)

    # Upper panel: trace of the accepted values.
    pylab.subplot(211)
    pylab.title('Metropolis-Hastings')
    pylab.plot(chain)

    # Lower panel: histogram of the accepted values after the burn-in,
    # with the theoretical curve on top.
    grid = pylab.arange(xmin, xmax, step)
    theory = self.target_distribution(grid)
    pylab.subplot(212)
    pylab.hist(chain[burn:], bins=bins, density=1)
    pylab.plot(grid, theory, 'r-')
    pylab.ylabel('Frequency')
    pylab.xlabel('x')
    pylab.legend(('PDF', 'Samples'))

    if output_filename is None:
        return
    self.bam_simul.filter_bool(output_filename, self.tokeep)
def plot_pca_vs_max_features(self, step=100, n_components=2, progress=True):
    """Plot what :meth:`plot` returns as a function of ``max_features``.

    :meth:`plot` is called (without showing a figure) for ``max_features``
    going from 10 up to ``len(self.df)`` (excluded) by increments of
    ``step``. The first ``n_components`` entries of each returned value are
    multiplied by 100 and drawn in stacked panels labelled ``PC1 (%)``,
    ``PC2 (%)``, ... (presumably the fraction of variance explained by each
    component -- confirm against :meth:`plot`).

    :param step: increment of ``max_features``; capped to ``len(self.df)``.
    :param n_components: number of components to show; must be 2, 3 or 4.
    :param progress: if true, animate a progress bar while iterating.

    .. plot::
        :include-source:

        from sequana.viz.pca import PCA
        from sequana import sequana_data
        import pandas as pd

        data = sequana_data("test_pca.csv")
        df = pd.read_csv(data)
        df = df.set_index("Id")
        p = PCA(df)
        p.plot_pca_vs_max_features()

    """
    assert n_components in [2,3,4]

    from easydev import Progress

    n_rows = len(self.df)
    step = min(step, n_rows)

    # The sweep starts at 10 features.
    feature_counts = range(10, n_rows, step)
    bar = Progress(len(feature_counts))

    results = []
    for done, max_features in enumerate(feature_counts, start=1):
        results.append(
            self.plot(n_components=n_components, max_features=max_features, show_plot=False)
        )
        if progress:
            bar.animate(done)

    # One panel per requested component, stacked vertically.
    labels = ("PC1 (%)", "PC2 (%)", "PC3 (%)", "PC4 (%)")
    for index in range(n_components):
        pylab.subplot(n_components, 1, index + 1)
        pylab.plot(feature_counts, [res[index] * 100 for res in results])
        pylab.ylabel(labels[index])
def diagnostics(self, bins=60, clear=True):
    """Draw a three-panel diagnostic figure.

    Panels, from top to bottom: histogram of ``self.aprob``; trace of
    ``self.vec``; normalised histogram of ``self.vec`` overlaid with the
    target curve (``self.Xtarget``, ``self.Ytarget``).

    :param bins: number of bins used by both histograms.
    :param clear: if true, the current figure is cleared first.
    """
    if clear:
        pylab.clf()

    n_panels = 3

    pylab.subplot(n_panels, 1, 1)
    pylab.hist(self.aprob, bins=bins)
    pylab.title("Acceptation")

    pylab.subplot(n_panels, 1, 2)
    pylab.plot(self.vec)
    pylab.title("proposition")

    pylab.subplot(n_panels, 1, 3)
    heights, _edges, _patches = pylab.hist(self.vec, bins, density=True, lw=0.5, ec="k")
    peak = max(heights)

    # This normalisation is an approximation/hack: the target curve is
    # rescaled so that its maximum equals the tallest histogram bar.
    scale = max(self.Ytarget) / peak
    pylab.plot(self.Xtarget, self.Ytarget / scale, "-ro")
    pylab.title("simulated (blue) and target (red) distributions")