def clean_outliers(points):
    boundaries_array = np.array(points)
    # linregress returns slope, intercept, r_value, p_value, std_err
    slope, intercept, _, _, _ = stats.linregress(boundaries_array)
    boundaries_array = remove_outliers(boundaries_array, slope, intercept)
    # Refit on the cleaned points and remove outliers a second time.
    slope, intercept, _, _, _ = stats.linregress(boundaries_array)
    return remove_outliers(boundaries_array, slope, intercept)
def slop(bin, binwidth):
    outputlist = [["Bin", "\t", "Frequency", "\t", "Slope1", "\t", "Slope2", "\t",
                   "peak-Width", "\t", "peak-Apex", "\t", "intercept_mass", "\n"]]
    # First-order slopes over a 7-bin sliding window (frequency vs. bin value).
    slope1 = [0]
    for index in range(0, len(bin) - 6):
        tempD = dict(itertools.islice(bin.items(), index, index + 7))
        s, intercept, r, p, std_error = linregress(list(tempD.keys()), list(tempD.values()))
        slope1.append(s)
    # Second-order slopes: regress the first-order slopes against the frequencies.
    slope2 = []
    for index1 in range(0, len(bin) - 13):
        tempD = dict(itertools.islice(bin.items(), index1 + 3, index1 + 10))
        s1, intercept1, r1, p1, std_error1 = linregress(list(tempD.values()),
                                                        slope1[index1 + 1:index1 + 8])
        slope2.append(s1)
    apex = []
    peak = []
    interceptList = [0]
    minus1 = 6 if len(bin) % 2 == 0 else 7
    # A sign change in slope1 marks a peak apex.
    for index3 in range(len(bin) - minus1):
        if slope1[index3] > 0.0 and slope1[index3 + 1] < 0.0:
            apex.append("1")
        else:
            apex.append("0")
    # A negative second-order slope marks the extent (width) of a peak.
    for index4 in range(len(bin) - 13):
        if slope2[index4] < 0:
            peak.append("1")
        else:
            peak.append("0")
    # Pad the derived series so their indices line up with the original bins.
    slope1 = [0] * 2 + slope1 + [0] * 3
    slope2 = [0] * 6 + slope2 + [0] * 6
    apex = [0] * 3 + apex + [0] * (len(bin) - (len(apex) + 3))
    peak = [0] * 6 + peak + [0] * 6
    for index6 in range(len(bin) - 6):
        if (abs(slope1[index6 + 1]) + abs(slope1[index6 + 2])) == 0.0:
            interceptList.append(float("inf"))
        else:
            tempD = dict(itertools.islice(bin.items(), index6, index6 + 1))
            intercept_mass = list(tempD.values())[0] + \
                (float(binwidth) * abs(slope1[index6 + 1])) / \
                (abs(slope1[index6 + 1]) + abs(slope1[index6 + 2]))
            interceptList.append(intercept_mass)
    interceptList = interceptList + [0] * (len(bin) - len(interceptList))
    for index5 in range(len(bin) - 13):
        tempD = dict(itertools.islice(bin.items(), index5, index5 + 1))
        outputlist.append([str(list(tempD.values())), "\t", str(slope1[index5]), "\t",
                           str(slope1[index5]), "\t", str(slope2[index5]), "\t",
                           str(peak[index5]), "\t", str(apex[index5]), "\t",
                           str(interceptList[index5]), "\n"])
    return outputlist
def test_multinomial_elementwise_distribution(self): '''Verify that the created variables approach a multinomial distribution for large numbers of samples.''' (m, n, k) = (6, 5, 1) r = 2 ** np.arange(4, 17) p = statutil.random_row_stochastic((m, n)) #p = statutil.scale_row_sums(np.ones((m, n))) error = np.zeros((len(r),)) for (i, r_val) in enumerate(r): for _ in xrange(k): x = statutil.multinomial_elementwise(p, r_val) # Root-mean-square-error of observed frequencies w.r.t. desired frequencies error[i] += statutil.norm_frobenius_scaled(statutil.hist(x, n) / (1.0 * r_val) - p) error[i] /= (1.0 * k) # Validate the model error of the central limit theorem: C*r^(-0.5). # This is a consequence of the Central Limit Theorem. We are making k experiments for # each value of n. Even if k=1, there's a 95% chance that we are within ~1.6 standard deviations # from the mean of the normal distribution sqrt(n)*[observed freq variable - p[i,j]] for each # entry j of a row i of the matrix p. So if row i's stddev is s[i], the sum of squared errors # should be (with 95% confidence) <= n * (1.96*s[i])^2. So # C <= sqrt(sum(n * (1.5*s[i])^2)_i / (m*n)) = 1.96 * sqrt(s[i]^2/m). # See http://en.wikipedia.org/wiki/Central_limit_theorem alpha, c, r_value, _, _ = linregress(np.log(r), np.log(error)) c = np.exp(c) # print c , 1.96 * np.linalg.linalg.norm(np.sum(p * np.arange(p.shape[1]) ** 2, axis=1) - # np.sum(p * np.arange(p.shape[1]), axis=1) ** 2, # 2) / np.sqrt(p.shape[0]), assert_almost_equal(alpha, -0.5, decimal=1, err_msg='Unexpected error term growth power') self.assertTrue(c <= 1.96 * np.linalg.linalg.norm(np.sum(p * np.arange(p.shape[1]) ** 2, axis=1) - np.sum(p * np.arange(p.shape[1]), axis=1) ** 2, 2) / np.sqrt(p.shape[0]), 'Error term coefficient outside 95% confidence interval') self.assertTrue(abs(r_value) > 0.99, 'Error does not fit a power law in sample size')
def calculate_monthly_lapse_rates(csv, station_meta): mdf = read_csv(csv, sep=' ', infer_datetime_format=True, index_col=0, parse_dates=True) mdf = mdf.groupby(mdf.index.month).mean() with open(station_meta, 'r') as js: stations = json.load(js) tmin_lapse, tmax_lapse = [], [] for temp in ['tmin', 'tmax']: for month in range(1, 13): temps, elevations = [], [] cols = [c for c in mdf.columns if temp in c] d = mdf[cols] [ temps.append(d['{}_{}'.format(s, temp)].loc[month]) for s in stations.keys() ] [elevations.append(v['elev']) for k, v in stations.items()] regression = linregress(elevations, temps) if temp == 'tmin': tmin_lapse.append('{:.3f}'.format(regression.slope * 1000.)) else: tmax_lapse.append('{:.3f}'.format(regression.slope * 1000.)) print('tmax_lapse = {}'.format(', '.join(tmax_lapse))) print('tmin_lapse = {}'.format(', '.join(tmin_lapse))) print('station elevations') elevs = sorted([(v['zone'], v['elev']) for k, v in stations.items()], key=lambda x: x[0]) print(', '.join([str(x[1]) for x in elevs]))
def reuse_model_reg(X_test, y_test, wildcard_name, ws=os.getcwd(), save=True): misc_output_path = os.path.join(os.getcwd(), 'output_rs_learn', 'misc') if not os.path.exists(misc_output_path): os.makedirs(misc_output_path) prediction_list = [] feature_list = [] for tuned_model in glob.glob( os.path.join(ws, 'output_rs_learn', 'tuned_models', f'{wildcard_name}*.sav')): model_trained = joblib.load(tuned_model) model_name = os.path.basename(tuned_model)[:-4] prediction = model_trained.predict(X_test) prediction_list.append(prediction) feature_list.append(model_name) slope, intercept, r_value, p_value, std_err = stats.linregress( y_test, prediction) r2 = r2_score(y_test, prediction) rmse = sqrt(mean_squared_error(prediction, y_test)) percent_err = ((prediction - y_test) / y_test) * 100 mnb = np.mean(percent_err) print(f'{model_name} r: %.2f, r2: %.2f, rmse: %.2f, mnb: %.2f' % (r_value, r2, rmse, mnb)) df_prediction = pd.DataFrame(prediction_list).T df_prediction.columns = feature_list return df_prediction
def rest_task_regression(): for tpt in [tpt_cole, tpt_sh]: fig, axs = plt.subplots(2, 3, figsize=(16, 10), sharex="row", sharey="row") txt = None for li, (lib, name, lbl) in enumerate(lib_details): df = lib.gen_long_data(tpt) \ .groupby(["task", "region", "network"]).mean().reset_index() \ .convert_column(metric=lambda x: x * 1000) df_rest = df.and_filter(task="Rest") txt = [] for ti, task in enumerate(task_order(False)): dft = pd.merge(df_rest, df.and_filter(task=task), on=["region", "network"]) ax = axs[li, ti] sns.scatterplot(data=dft, x="metric_x", y=f"metric_y", hue="network", hue_order=tpt.net_order, ax=ax, palette=tpt.net_colors) slope, intercept, r_value, _, _ = stats.linregress(dft.metric_x, dft.metric_y) sns.lineplot(dft.metric_x, slope * dft.metric_x + intercept, ax=ax, color='black') ax.text(0.3, 0.8, f"$r^2$={r_value ** 2:.2f}***", ha='center', va='center', transform=ax.transAxes) ax.set(xlabel=f"Rest {lbl}", ylabel="") ax.get_legend().remove() txt.append(ax.text(-0.15 if ti == 0 else -0.05, 0.5, f"{task} {lbl}", transform=ax.transAxes, rotation=90, va='center', ha='center')) legend_handles = [] for net, color, label in zip(tpt.net_order, tpt.net_colors, tpt.net_labels(break_space=False)): legend_handles.append(Line2D([], [], color=color, marker='o', linestyle='None', markersize=5, label=label)) n_col = 6 if len(tpt.net_order) == 12 else 7 lgn = fig.legend(handles=legend_handles, loc=2, ncol=n_col, handletextpad=0.1, mode="expand", bbox_to_anchor=(0.12, -0.04, 0.785, 1)) print(savefig(fig, f"regression.{tpt}", extra_artists=txt + [lgn, ], low=False))
def calculate_histogram_sizes(tracks_queue, config, out_queue): params = config['tracking']['process'] df = DataFrame() sleep(5) while True: while not tracks_queue.empty() or tracks_queue.qsize() > 0: data = tracks_queue.get() df = df.append(data) if len(df) % 100 == 0: # t1 = tp.filter_stubs(df, params['min_traj_length']) # print(t1.head()) # t2 = t1[((t1['mass'] > params['min_mass']) & (t1['size'] < params['max_size']) & # (t1['ecc'] < params['max_ecc']))] # print(t2.head()) # t2 = t1 # d = tp.compute_drift(t1) # tm = tp.subtract_drift(t2.copy(), d) im = tp.imsd(df, config['tracking']['process']['um_pixel'], config['camera']['fps']) values = [] for pcle in im: data = im[pcle] slope, intercept, r, p, stderr = stats.linregress(np.log(data.index), np.log(data.values)) values.append([slope, intercept]) out_queue.put(values)
def fit_exponential_func(y, x):
    # Fit y = A*e^(Bx) via log(y) = log(A) + B*x; returns A and B.
    B, logA, r_value, p_value, std_err = linregress(np.transpose(x.values), np.log(y))
    return np.exp(logA), B
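# Hedged usage sketch, not part of the original source: it assumes numpy/pandas are
# available and that linregress has been imported from scipy.stats as in the snippet
# above, and that x is a pandas Series (the function reads x.values).
import numpy as np
import pandas as pd

x_demo = pd.Series(np.linspace(0.0, 4.0, 50))
y_demo = 3.0 * np.exp(0.7 * x_demo)           # synthetic data following y = A*e^(B*x)
A_fit, B_fit = fit_exponential_func(y_demo, x_demo)
print(A_fit, B_fit)                           # expect roughly (3.0, 0.7)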
def find_consensus(unassigned, sample, is_vertical): """Attempt to find a set of measurements that forms a consensus, in the list of measurements. Args: unassigned (list): List of unassigned measurements. sample (list): Measurements that fit the extracted line. is_vertical (bool): Whether the landmark is close to being vertical. Returns: list: List of measurements that fit the line. """ cartesian_sample = numpy.array([point.location for point in sample]) cartesian_unassigned = numpy.array( [point.location for point in unassigned]) consensus = [] # If almost vertical, calculate line in terms of y. if is_vertical: cartesian_sample = numpy.fliplr(cartesian_sample) cartesian_unassigned = numpy.fliplr(cartesian_unassigned) # Calculate regression line. slope, intercept, r_, p_, e_ = stats.linregress(cartesian_sample[:, 0], cartesian_sample[:, 1]) # Find the unassigned points that match to this line. for i in range(len(unassigned)): # If the point lies close enough to the line. if util.point_line_dist(cartesian_unassigned[i], slope, intercept) < RANSAC_TOLERANCE: consensus.append(unassigned[i]) # Add it to the consensus points. return consensus
def plot_mean_boxplot_with_pearson(dataset_id): data = [] pearson = [] for i, technique_id in enumerate(technique_list): print(Globals.acronyms[technique_id], end=' ', flush=True) technique_pearson = [] technique_data = [] history = Parser.parse_rectangles(technique_id, dataset_id) for revision in range(len(history) - 1): delta_vis = DeltaMetrics.compute_delta_vis(history[revision], history[revision + 1]) delta_data = DeltaMetrics.compute_delta_data( history[revision], history[revision + 1]) un_mov = UnavoidableMovement.compute_unavoidable_movement( history[revision], history[revision + 1]) ratios = (1 - delta_vis) / (1 - delta_data) diffs = 1 - abs(delta_vis - delta_data) unavoidable = 1 - (delta_vis - un_mov) mean = (ratios + diffs + unavoidable) / 3 technique_data.append(mean) # Compute linear regression statistics _, _, r_value, _, _ = stats.linregress(delta_data, delta_vis) technique_pearson.append(r_value if r_value > 0 else 0) data.append(technique_data) pearson.append(technique_pearson) TimeBoxplot.plot_with_pearson(data, technique_list, pearson, title='Mean with Pearson - ' + dataset_id)
def approximate_random_effects(data, labels, group):
    correlation_per_donor = {}
    for donor_id in set(data[group]):
        correlation_per_donor[donor_id], _, _, _, _ = linregress(
            list(data[labels[0]][data[group] == donor_id]),
            list(data[labels[1]][data[group] == donor_id]))
    average_slope = np.array(correlation_per_donor.values()).mean()
    t, p_val = ttest_1samp(correlation_per_donor.values(), 0)
    print "Averaged slope across donors = %g (t=%g, p=%g)" % (average_slope, t, p_val)
    return average_slope, t, p_val
def draw_fit(x, y):
    # Fit and draw a regression line over the first `arg.regression` points.
    limit = arg.regression
    x = x[:limit]
    y = y[:limit]
    slope, intercept, r_value, *_ = linregress(x, y)
    text = r'$f(x)={0:.3f}x+{1:.3f}, R^2={2:.3f}$'.format(slope, intercept, r_value**2)
    fit = [slope * i + intercept for i in x]
    plt.plot(x, fit, 'k--')
    plt.annotate(text, xy=(x[-1], y[-2]))
def regress(my_dict):
    # Regress the dictionary values against their insertion order (0, 1, 2, ...).
    x = []
    y = []
    for count, v in enumerate(my_dict.values()):
        x.append(count)
        y.append(v)
    m, b, r, p, std_err = linregress(x, y)
    print("b = " + str(b) + ", m = " + str(m) + ", r^2 = " + str(r * r))
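# Hedged usage sketch, not in the original source: any ordered mapping works, since
# the values are regressed against 0, 1, 2, ...; linregress is assumed to have been
# imported from scipy.stats as in the surrounding snippets.
regress({'jan': 10.0, 'feb': 12.5, 'mar': 15.1})   # prints intercept, slope and r^2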
def scatter_plot(ssu_df, fg_df): ssu_iden, fg_iden, fg_siml = ssu_df['identity(%)'], fg_df['identity(%)'], fg_df['similarity(%)'] fig = plt.figure(figsize=(15,7),dpi=300) gs = gridspec.GridSpec(1,2,wspace=0.2,left=0.05, right=0.95) # correlation plot of 16S rRNA identity versus funtional gene identity ax0 = plt.subplot(gs[0]) plt.scatter(ssu_iden,fg_iden,color='blue',s=1) iden_func = stats.linregress(ssu_iden,fg_iden) x_rg = range(int(min(ssu_iden)),int(max(ssu_iden))+1) y_rg = np.polyval([iden_func[0],iden_func[1]],x_rg) plt.text(5,95, r'$y = %.2f x %s $' % (iden_func[0],intercept(iden_func[1])), fontsize=15) plt.text(5,90, r'$R^2=%.4f$' % (iden_func[2]**2)) plt.text(5,85, r'$P-value=%.2e$' % (iden_func[3])) plt.text(5,80, r'$StdErr=%.4f$' % (iden_func[4])) plt.title('16S rRNA identity vs. Funtional gene identity') plt.plot(x_rg,y_rg,'r--',label='line 1') plt.xlabel('16S rRNA gene identity (%)') plt.ylabel('Funtional gene identity (%)') plt.ylim(0,100) plt.xlim(0,100) # correlation plot of 16S rRNA identity versus funtional gene similarity ax1 = plt.subplot(gs[1]) plt.scatter(ssu_iden,fg_siml,color='green',s=1) siml_func = stats.linregress(ssu_iden,fg_siml) x_rg = range(int(min(ssu_iden)),int(max(ssu_iden))+1) y_rg = np.polyval([siml_func[0],siml_func[1]],x_rg) plt.text(5,95, r'$y = %.2f x %s $' % (siml_func[0],intercept(siml_func[1])), fontsize=15) plt.text(5,90, r'$R^2=%.4f$' % (siml_func[2]**2)) plt.text(5,85, r'$P-value=%.2e$' % (siml_func[3])) plt.text(5,80, r'$StdErr=%.4f$' % (siml_func[4])) plt.title('16S rRNA identity vs. Funtional gene similarity') (m,b) = np.polyfit(ssu_iden,fg_siml, 1) x_rg = range(int(min(ssu_iden)),int(max(ssu_iden))+1) y_rg = np.polyval([m,b],x_rg) plt.plot(x_rg,y_rg,'r--') plt.xlabel('16S rRNA gene identity (%)') plt.ylabel('Funtional gene similarity (%)') plt.ylim(0,100) plt.xlim(0,100) plt.savefig(o_dir+'/correlation_plot.pdf') return iden_func, siml_func
def _linear_regression(self):
    """
    Final trend, expressed as a linear fit of the trend signal obtained after
    deseasonalising the input signal.
    :return: slope, intercept, p_value, r_value**2, std_err
    """
    line = np.asarray(self.d).copy()
    # line = filter_outlier(np.asarray(temporal_series).copy(), nsigma=1)
    xx = np.arange(0, len(line), 1)
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        xx[~np.isnan(line)], line[~np.isnan(line)])
    return slope, intercept, p_value, np.square(r_value), std_err
def linregress(x_vals, y_vals):
    '''Least-squares regression via scipy, plotted with pylab.'''
    a_value, b_value, r_value, p_value, std_err = stats.linregress(x_vals, y_vals)
    est_yvals = a_value * pylab.array(x_vals) + b_value
    k = 1 / a_value
    # plot regression line
    print p_value, std_err
    pylab.plot(x_vals, est_yvals,
               label='Least-squares fit, k = ' + str(round(k)) +
                     ", RSquare = " + str(r_value**2))
    pylab.legend(loc='best')
def plot_regression(df, x, y, extra_names={}): '''Plot a regression with annotated statistics.''' # ugly hack to include origin in plot bounds plt.clf() ax = _do_plot(df, x, y) xlim, ylim = ax.get_xlim(), ax.get_ylim() ax.cla() ax.set_xlim(*xlim) ax.set_ylim(*ylim) _do_plot(df, x, y, ax=ax) # calculate some regression statistics... info = [ ("{} = " + ("{}" if isinstance(v, int) else "{:.2f}")).format(k, v) for k, v in it.chain( zip( [ 'Slope', 'Intercept', '$R^2$', '$p$', 'Standard Error', ], stats.linregress(df[x], df[y]), ), [('$n$', len(df))]) ] # ... and annotate regression statistics onto upper left at = AnchoredText( '\n'.join(info), frameon=True, loc='upper left', ) ax.add_artist(at) # save to file # and assert df['Load'] is homogeneous plt.savefig( kn.pack({ **{ 'x': slugify(x), 'y': slugify(y), 'synchronous': str(synchronous), 'ext': '.png', }, **extra_names }), transparent=True, dpi=300, )
def approximate_random_effects(data, labels, group): slope_per_donor = np.array([]) rval_per_donor = np.array([]) #print "Performing approximate random effect analysis..." for donor_id in set( data[group]): #for donor_id in donorids, perform linear regression #print "Total usable datapoints of donor %s: %d" % (donor_id, len(list(data[labels[0]][data[group] == donor_id]))) #shows usable datapoints per donor slope, _, rval, p_val, stderr = linregress( list(data[labels[0]][data[group] == donor_id]), list(data[labels[1]][data[group] == donor_id])) slope_per_donor = np.append(slope_per_donor, slope) rval_per_donor = np.append(rval_per_donor, rval) #average_slope = round(slope_per_donor.mean(),6) #get mean r-value across donors #average_rval = round(rval_per_donor.mean(),6) #get mean r-value across donors average_slope = round(np.nanmean(slope_per_donor), 6) #get mean r-value across donors average_rval = round(np.nanmean(rval_per_donor), 6) #get mean r-value across donors t_value, p_value = ttest_1samp( slope_per_donor, 0) #t-test (redundant information for downstream analyses) with open(output_file, 'a') as f: #saving full data to .csv w = csv.writer(f) #print "Saving the analysis results..." w.writerow([ gene, average_rval, average_slope, rval_per_donor[0], rval_per_donor[1], rval_per_donor[2], rval_per_donor[3], rval_per_donor[4], rval_per_donor[5], t_value, p_value ]) with open(output_file_GSEA, 'a') as f: #saving GSEA input data to .csv w = csv.writer(f, delimiter='\t') #print "Saving to GSEA input file..." w.writerow([gene, average_rval]) #Scatterplot of gene expression against reverse inference fMRI map z-score print "Plotting the correlation graph..." ax = sns.lmplot(labels[0], labels[1], data, hue=group, legend=True, fit_reg=True) #comment-out for no plotting ax.set(xlabel="%s map z-score value" % (cog_function.capitalize())) ax = plot.title(gene) print "Saving the correlation graph..." plot.savefig(plot_pdf, format='pdf') plot.close() return
def get_taa_group_features(t, seeg, coords, tfr, tto): mask = (t > np.min(tfr)) * (t < max(np.max(tto), np.min(tfr) + 5.0)) seeg -= np.mean(seeg, axis=1)[:, None] pca = PCA(n_components=PCA_NCOMP) comps = pca.fit_transform(seeg.T) var_explained = pca.explained_variance_ratio_ duration = np.mean(tto - tfr) line_coords = np.linalg.norm(coords - coords[0], axis=1) slope, _, rval, _, _ = stats.linregress(line_coords, tfr) return duration, abs(slope), rval**2, var_explained[0], sum( var_explained[0:2])
def calibrate_data(in_data):
    """
    Takes an input time series containing ccd (the predictor, i.e. independent variable)
    and gauge measurements (the predictand, i.e. dependent variable), and uses a linear
    model to derive the calibration parameters.

    :param in_data: input array with four columns of data: year, month, ccd, rain gauge measurement
    :return: calibration parameters (a0, a1)
    """
    # slice the array to select the gauge and ccd data
    gauge = in_data[:, 3]
    ccd = in_data[:, 2]

    # derive a linear model using linregress, imported from the scipy package
    linear_model = linregress(ccd, gauge)
    a1 = linear_model[0]  # slope
    a0 = linear_model[1]  # intercept

    # return a tuple containing the calibration parameters
    return a0, a1
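# Hedged usage sketch, not from the original source: a tiny synthetic record of
# (year, month, ccd, gauge) rows where gauge is roughly 0.5*ccd + 2; linregress is
# assumed to be imported from scipy.stats as in the function above.
import numpy as np

demo = np.array([[2020, 1, 10.0, 7.1],
                 [2020, 2, 20.0, 12.0],
                 [2020, 3, 30.0, 16.9],
                 [2020, 4, 40.0, 22.1]])
a0, a1 = calibrate_data(demo)
print(a0, a1)   # intercept close to 2.0, slope close to 0.5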
def approximate_random_effects(data, labels, group):
    correlation_per_donor = {}
    for donor_id in set(data[group]):
        correlation_per_donor[donor_id], _, _, _, _ = linregress(
            list(data[labels[0]][data[group] == donor_id]),
            list(data[labels[1]][data[group] == donor_id]))
    average_slope = np.array(correlation_per_donor.values()).mean()
    t, p_val = ttest_1samp(correlation_per_donor.values(), 0)
    print "Averaged slope across donors = %g (t=%g, p=%g)" % (average_slope, t, p_val)
    sns.violinplot([correlation_per_donor.values()], inner="points", names=["donors"])
    plt.ylabel("Linear regression slopes between %s and %s" % (labels[0], labels[1]))
    plt.axhline(0, color="red")
    sns.lmplot(labels[0], labels[1], data, hue=group, col=group, col_wrap=3)
    plt.show()
    return average_slope, t, p_val
def capm(investment, market, risk_free_return=0):
    """Computes historical CAPM parameters, using log returns, of the investment
    over the market.

    investment -- The daily prices of the investment under analysis.
    market -- The daily prices of the market investment.
    risk_free_return -- The risk-free return over the period of consideration,
        given as a fraction.

    Returns (alpha, beta, r), where r is the r-value."""
    alr = log(1.0 + risk_free_return)
    investment_returns = [log(1.0 * b / a) - alr
                          for (a, b) in zip(investment[0:-1], investment[1:])]
    market_returns = [log(1.0 * b / a) - alr
                      for (a, b) in zip(market[0:-1], market[1:])]
    x = linregress(market_returns, investment_returns)
    beta = x[0]   # slope
    alpha = x[1]  # intercept
    r = x[2]
    return (alpha, beta, r)
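# Hedged usage sketch, not part of the original source: two short synthetic daily
# price series. It assumes `log` is math.log and `linregress` comes from scipy.stats,
# matching the free names used inside capm above.
from math import log

market_prices = [100.0, 101.0, 102.5, 101.8, 103.0]
asset_prices = [50.0, 50.8, 51.9, 51.2, 52.3]
alpha, beta, r = capm(asset_prices, market_prices)
print(alpha, beta, r)   # beta measures the asset's sensitivity to market moves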
def comp_Z(se_data):
    ulist = np.unique(se_data[:, 1])
    max_points = 3
    Z = []
    for u in ulist:
        ui = np.where(se_data[:, 1] == u)
        # find the lowest available temperatures
        ii = np.argsort(se_data[ui][:, 0])
        d = se_data[ui][ii][-max_points:]   # lowest temperatures for the given U
        w0l = np.pi / d[:, 0]               # zeroth Matsubara frequency
        dRSigma = d[:, 2] / w0l             # approximation of the derivative of the self-energy at w=0
        res = stats.linregress(1. / np.array(d[:, 0]), dRSigma)
        rr = unc.ufloat(res.intercept, res.stderr)
        rr = 1. / (1. - rr)
        Z.append([u, rr.n, rr.std_dev])
    return np.array(Z)
def _test1(): np.random.seed(0) x = np.linspace(0., 10., 41) y1 = 2. - 1.5 * x # (2,-1) y2 = 2. * x - 5. # (3, 1) y3 = -x + 4. # (5, -1) y4 = 2. * x - 11. y = np.array(x) y[np.where(x < 2)] = y1[np.where(x < 2)] y[np.where((x >= 2) & (x < 3))] = y2[np.where((x >= 2) & (x < 3))] y[np.where((x >= 3) & (x < 5))] = y3[np.where((x >= 3) & (x < 5))] y[np.where(5 <= x)] = y4[np.where(5 <= x)] # plot(x, y, 'o') # show() n = len(x) var_x0 = np.var(x[:-1]) * (n - 1.) var_y0 = np.var(y[:-1]) * (n - 1.) mean_x = np.mean(x[:-1]) + (x[-1] - np.mean(x[:-1])) / n mean_y = np.mean(y[:-1]) + (y[-1] - np.mean(y[:-1])) / n dx = x[-1] - mean_x dy = y[-1] - mean_y _assert_eq(np.var(x) * n, _update_var(var_x0, n, dx)) _assert_eq(np.var(y) * n, _update_var(var_y0, n, dy)) beta0 = np.cov(x[:-1], y[:-1], bias=True)[0][1] / np.var(x[:-1]) slope, intercept, r_value, p_value, std_err = stats.linregress(x, y) # print(slope) # print(np.cov(x, y, bias=True) [0][1] / np.var(x)) print('slope exact = {}, computed = {}'.format( slope, _update_beta(beta0, n, dx, dy, var_x0, np.var(x) * n))) print('intercept exact = {}, computed = {}'.format(intercept, mean_y - slope * mean_x)) segs = seg_lin_reg(x, y, 0.0001) assert len(segs) == 4 _assert_eq(segs[0][1], 2.), _assert_eq(segs[0][2], -1.5) _assert_eq(segs[1][1], -5.), _assert_eq(segs[1][2], 2.) _assert_eq(segs[2][1], 4.), _assert_eq(segs[2][2], -1.) _assert_eq(segs[3][1], -11.), _assert_eq(segs[3][2], 2.) plot_segments(x, y, 0.0001) # test spikes y[17] = 2 y[7] = 0 y[-1] = 7 y[-2] = 6 plot_segments(x, y, 0.0001)
def analyze(self, gaps: Sequence, mlc: MLC, y_field_size: float = 100, profile_width=10): """Analyze an EPID image with varying MLC overlaps to determine the DLG. Parameters ---------- gaps The gaps (i.e. overlap) of the leaves in mm. These should typically be in descending order and also be negative. E.g. (-1, ..., -2.2). mlc The MLC type/arrangement. This lets us know where the leaf centers are to take a profile along. y_field_size The field size along the y-dimension (perpendicular to the leaf travel). This will determined which leaves are associated with which gap. profile_width The width of the profile to take along the axes parallel to leaf motion. This should be a good bit wider than the gap values. The default is reasonable and it is unlikely it needs tweaking. """ measured_dlg_per_leaf = [] planned_dlg_per_leaf = [] mlc = mlc.value['arrangement'] g = list(gaps) g.sort() profile_width_px = round(self.image.dpmm * profile_width) mid_width = self.image.shape[1] / 2 mid_height = self.image.shape[0] / 2 for idx, center in enumerate(mlc.centers): if -y_field_size / 2 < center < y_field_size / 2: # get the pixel window area center_px = center * self.image.dpmm width_px = mlc.widths[idx] / 4 * self.image.dpmm top = ceil(mid_height + center_px + width_px) bottom = floor(mid_height + center_px - width_px) # sample the window and take the average perpendicular to MLC motion window = self.image[bottom:top, int(mid_width - profile_width_px):int(mid_width + profile_width_px)] width = self._determine_measured_gap(window.mean(axis=0)) planned_dlg_per_leaf.append(self._get_dlg_offset(y_field_size, center, g)) measured_dlg_per_leaf.append(width) # fit the data to a line and determine the DLG from the 0 intercept lin_fit = stats.linregress(planned_dlg_per_leaf, measured_dlg_per_leaf) dlg = lin_fit.intercept / lin_fit.slope self._lin_fit = lin_fit self.measured_dlg = dlg self.planned_dlg_per_leaf = planned_dlg_per_leaf self.measured_dlg_per_leaf = measured_dlg_per_leaf
def fit_exp_f(y, x):
    """
    Returns parameters A and B that would fit an exponential function of y = A*e^(Bx)

    Parameters
    ----------
    y: pd.Series
        Variable y in the formula
    x: pd.Series
        Variable x in the formula

    Returns
    -------
    Parameters A and B
    """
    # Fit with y = A*e^(Bx) -> log(y) = log(A) + B*x
    B, logA, r_value, p_value, std_err = linregress(transpose(x.values), log(y))
    return exp(logA), B
def calculate_histogram(self): self.calculating_histograms = True locations = self.locations.copy() t1 = tp.filter_stubs(locations, self.config['process']['min_traj_length']) # t2 = t1[((t1['mass'] > self.config['process']['min_mass']) & (t1['size'] < self.config['process']['max_size']) & # (t1['ecc'] < self.config['process']['max_ecc']))] im = tp.imsd(t1, self.config['process']['um_pixel'], self.config['process']['fps']) self.histogram_values = [] for pcle in im: if general_stop_event.is_set(): break data = im[pcle] t = data.index[~np.isnan(data.values)] val = data.values[~np.isnan(data.values)] try: slope, intercept, r, p, stderr = stats.linregress(np.log(t), np.log(val)) self.histogram_values.append([slope, intercept]) except: pass self.calculating_histograms = False self.publisher.publish('histogram', self.histogram_values)
def recalculate_line(consensus, is_vertical): """Given a discovered consensus, recalculate the line with other points that are close enough. Args: consensus (list): List of consensus measurements. is_vertical (bool): Whether the line is almost vertical. Returns: tuple: Start and end points of line segment. """ cartesian_consensus = numpy.array([point.location for point in consensus]) # If almost vertical, calculate line in terms of y. if is_vertical: cartesian_consensus = numpy.fliplr(cartesian_consensus) # Calculate regression line. slope, intercept, r_, p_, e_ = stats.linregress(cartesian_consensus[:, 0], cartesian_consensus[:, 1]) start = util.nearest(cartesian_consensus[0], slope, intercept) end = util.nearest(cartesian_consensus[0], slope, intercept) distance = 0 for i in range(len(consensus)): for j in range(i + 1, len(consensus)): point_a = util.nearest(cartesian_consensus[i], slope, intercept) point_b = util.nearest(cartesian_consensus[j], slope, intercept) new_dist = util.dist(point_a, point_b) if new_dist > distance: distance = new_dist start = point_a end = point_b # If line is vertical, flip coordinates back. if is_vertical: start = numpy.flipud(start) end = numpy.flipud(end) return start, end
def get_slopes(symbol_list, fund_type):
    # For each fund, I perform a simple least-squares linear regression
    # to get the value as a function of time.
    # Here, I also restrict the analysis to only those funds which have
    # gained value over the past five years (i.e. have a positive slope).
    # The logic behind this is that, if we're only adding one fund to the
    # portfolio, we can limit ourselves to choosing one that has
    # historically done well. The question is then whether the US bonds
    # that have done well have done better than the emerging market funds
    # that have done well.
    slopes = []
    for symbol in symbol_list:
        slope = stats.linregress(parse_csv(symbol, fund_type))[0]
        if slope > 0.0:
            slopes.append(slope)
    return slopes
def decompose(_data, _plots=False):
    '''
    Function to decompose a signal into its trend and normal variation
    Input:
        _data: signal to decompose
        _plots: print plots or not (default False)
    Output:
        dataDecomp = _data - slope*_data.index
        slope, intercept = linear regression coefficients
    '''
    indexDecomp = np.arange(len(_data))
    slope, intercept, r_value, p_value, std_err = linregress(indexDecomp,
                                                             np.transpose(_data.values))
    dataDecomp = pd.DataFrame(index=_data.index)
    name = _data.name
    result = []
    for n in range(len(_data)):
        result.append(float(_data.values[n] - slope * n))
    dataDecomp[name + '_flat'] = result
    trend = slope * indexDecomp + intercept
    if _plots:
        with plt.style.context('seaborn-white'):
            fig, ax = plt.subplots(figsize=(20, 10))
            ax.plot(_data.index, _data.values, label="Actual", marker=None)
            ax.plot(_data.index, dataDecomp[name + '_flat'], marker=None, label='Flattened')
            ax.plot(_data.index, trend, label='Trend')
            ax.legend(loc="best")
            ax.axis('tight')
            ax.set_title("Signal Decomposition - " + name)
            ax.set_xlabel('Index')
            ax.set_ylabel('Signal')
            ax.grid(True)
            plt.show()
    return dataDecomp, slope, intercept
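# Hedged usage sketch, not from the original source: a named pandas Series with a
# linear trend plus noise. It assumes numpy/pandas are available and linregress is
# imported from scipy.stats, as in the function above.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
signal = pd.Series(0.3 * np.arange(200) + rng.normal(0, 1, 200), name='sensor')
flat, slope, intercept = decompose(signal, _plots=False)
print(slope)   # should be close to the true trend of 0.3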
def calculate_histogram(self): """ Starts a new thread to calculate the histogram of fit-parameters based on the mean-squared displacement of individual particles. It publishes the data on topic `histogram`. .. warning:: This method is incredibly expensive. Since it runs on a thread it can block other pieces of code, especially the GUI, which runs on the same process. .. TODO:: The histogram loops over all the particles. It would be better to skeep particles for which there is no new data .. TODO:: Make this method able to run on a separate process. So far is not possible because it relies on data stored on the class itself (`self.locations`). """ self.calculating_histograms = True locations = self.locations.copy() t1 = tp.filter_stubs(locations, self.config['process']['min_traj_length']) t2 = t1[((t1['mass'] > self.config['process']['min_mass']) & (t1['size'] < self.config['process']['max_size']) & (t1['ecc'] < self.config['process']['max_ecc']))] im = tp.imsd(t2, self.config['process']['um_pixel'], self.config['process']['fps']) self.histogram_values = [] for pcle in im: if general_stop_event.is_set(): break data = im[pcle] t = data.index[~np.isnan(data.values)] val = data.values[~np.isnan(data.values)] try: slope, intercept, r, p, stderr = stats.linregress( np.log(t), np.log(val)) self.histogram_values.append([slope, intercept]) except: pass self.calculating_histograms = False self.publisher.publish('histogram', self.histogram_values)
def linregress_hb_drop_with_time_to_previous_rbc(self, si, threshold=np.inf): """ Compute a linear regression of each RBCs hemoglobin saturation drop with the time difference to the previous RBC. Args: si (int): segment index threshold (float): maximum value of time difference used for the linear regression Returns: float tuple, return value of scipy.stats.linregress """ time_difference = self.rbcDataPostProcessor.timeToPreviousRBC( si, self.n_rbc_average(si)) hb_drop = self.hb_difference(si)[1:] filtered_times = time_difference[time_difference < threshold] filtered_drops = hb_drop[time_difference < threshold] if filtered_times.size: return linregress(filtered_times, filtered_drops) else: return np.nan, np.nan, np.nan, np.nan
def getLinearRegressionData(self, log):
    values = self.createValuesForRegression(log)
    slope, intercept, r_value, p_value, std_err = linregress(values[0], values[1])
    return [slope, intercept, r_value * r_value]
def fit( x, y, funchandle='gauss1', estimates=None ): """ Returns: fitstruct, fitY, Rbest """ from scipy.optimize import curve_fit from scipy.stats.stats import linregress if funchandle == 'gauss1': def fitfunc( x, a1, b1, c1 ): return a1 * np.exp( -( (x-b1)/ c1)**2 ) # Really arbitrary c1 estimate at basically 25 pixels.. if estimates is None: estimates = np.array( [np.max(y), x[np.argmax(y)], 25.0*(x[1]-x[0]) ] ) elif funchandle == 'poly1': def fitfunc( x, a1, b1 ): return a1 * x + b1 if estimates is None: slope = (np.max(y)-np.min(y))/(np.max(x)-np.min(x)) intercept = np.min(y) - slope*x[np.argmin(y)] estimates = [slope, intercept] elif funchandle == 'poly2': def fitfunc( x, a1, b1, c1 ): return a1 * x **2.0 + b1 *x + c1 if estimates is None: slope = (np.max(y)-np.min(y))/(np.max(x)-np.min(x)) intercept = np.min(y) - slope*x[np.argmin(y)] estimates = [0.0, slope, intercept] elif funchandle == 'poly3': def fitfunc( x, a1, b1, c1, d1 ): return a1 * x **3.0 + b1 *x**2.0 + c1*x + d1 if estimates is None: slope = (np.max(y)-np.min(y))/(np.max(x)-np.min(x)) intercept = np.min(y) - slope*x[np.argmin(y)] estimates = [0.0, 0.0, slope, intercept] elif funchandle == 'poly5': def fitfunc( x, a1, b1, c1, d1, e1, f1 ): return a1 * x **5.0 + b1 *x**4.0 + c1*x**3.0 + d1*x**2.0 + e1*x + f1 if estimates is None: slope = (np.max(y)-np.min(y))/(np.max(x)-np.min(x)) intercept = np.min(y) - slope*x[np.argmin(y)] estimates = [0.0, 0.0, 0.0, 0.0, slope, intercept] elif funchandle == 'abs1': def fitfunc( x, a1 ): return a1 * np.abs( x ) if estimates is None: estimates = np.array( [ (np.max(y)-np.min(y))/(np.max(x)-np.min(x))]) elif funchandle == 'exp': def fitfunc( x, a1, c1 ): return a1 * np.exp( c1*x ) if estimates is None: estimates = np.array( [1.0, -1.0] ) elif funchandle == 'expc': def fitfunc( x, a1, c1, d1 ): return a1 * np.exp( c1*x ) + d1 if estimates is None: estimates = np.array( [1.0, -1.0, 1.0] ) elif funchandle == 'power1': def fitfunc( x, a1, b1 ): return a1*(x**b1) if estimates is None: estimates = np.array( [1.0, -2.0] ) elif funchandle == 'power2': def fitfunc( x, a1, b1, c1 ): return a1*(x**b1) + c1 if estimates is None: estimates = np.array( [1.0, -2.0, 1.0] ) elif funchandle == 'powerpoly1': def fitfunc( x, a1, b1, a2, c1 ): return a1*(x**b1) + a2*x + c1 if estimates == None: estimates = np.array( [1.0, -2.0, 0.0, 1.0] ) else: fitfunc = funchandle try: fitstruct, pcov = curve_fit( fitfunc, x, y, p0=estimates ) perr = np.sqrt(np.diag(pcov)) print( "Fitting completed with perr = " + str(perr) ) fitY = fitfunc( x, *fitstruct ) goodstruct = linregress( x, fitfunc( x, *fitstruct ) ) Rbest = goodstruct[2] except RuntimeError: print( "RAM: Curve fitting failed") return return fitstruct, fitY, Rbest
# Remove Visual count 0 class #dfCS2 = dfCS[dfCS['JNM']!=0] #dfCS2 = dfCS1 # ----------------------------------------------- #### see zero counts Camille with large count software dfCS0 = dfCS[dfCS['JNM']==0] #### Query 0 count for me and count for C dfCSM0 = dfCS2[dfCS2['SoftC']==0] # -------------------------------------------------- # Create lists for corellation camC1 = list(dfCS3['J']) objc = list(dfCS3['SoftC']) slopeO, interceptO, r_valueO, p_valueO, std_errO = linregress(camC1,objc) print "r squared count = ",r_valueO**2 r_valueO = r_valueO**2 print "slope",slopeO print "p-value",p_valueO # plot raw corellation plt.scatter(camC1,objc) #plt.title('Obj count: erode = %s, dilate = %s, thres = %s'%(er,dil,thres)) plt.xlabel('Visual count') plt.ylabel('Software count') #pylab.savefig(resultsdir + 'ObjCount-' + str(count) + '.pdf',bbox_inches='tight') plt.show() # plot mean with sd
def test_regression_of_returns_factor(self, returns_length, regression_length): """ Tests for the built-in factor `RollingLinearRegressionOfReturns`. """ my_asset_column = 0 start_date_index = 6 end_date_index = 10 assets = self.asset_finder.retrieve_all(self.sids) my_asset = assets[my_asset_column] my_asset_filter = AssetID() != (my_asset_column + 1) num_days = end_date_index - start_date_index + 1 # The order of these is meant to align with the output of `linregress`. outputs = ["beta", "alpha", "r_value", "p_value", "stderr"] # Our regression factor requires that its target asset is not filtered # out, so make sure that masking out our target asset does not take # effect. That is, a filter which filters out only our target asset # should produce the same result as if no mask was passed at all. for mask in (NotSpecified, my_asset_filter): regression_factor = RollingLinearRegressionOfReturns( target=my_asset, returns_length=returns_length, regression_length=regression_length, mask=mask ) results = self.engine.run_pipeline( Pipeline(columns={output: getattr(regression_factor, output) for output in outputs}), self.dates[start_date_index], self.dates[end_date_index], ) output_results = {} expected_output_results = {} for output in outputs: output_results[output] = results[output].unstack() expected_output_results[output] = full_like(output_results[output], nan) # Run a separate pipeline that calculates returns starting 2 days # prior to our start date. This is because we need # (regression_length - 1) extra days of returns to compute our # expected regressions. returns = Returns(window_length=returns_length) results = self.engine.run_pipeline( Pipeline(columns={"returns": returns}), self.dates[start_date_index - (regression_length - 1)], self.dates[end_date_index], ) returns_results = results["returns"].unstack() # On each day, calculate the expected regression results for Y ~ X # where Y is the asset we are interested in and X is each other # asset. Each regression is calculated over `regression_length` # days of data. for day in range(num_days): todays_returns = returns_results.iloc[day : day + regression_length] my_asset_returns = todays_returns.iloc[:, my_asset_column] for asset, other_asset_returns in todays_returns.iteritems(): asset_column = int(asset) - 1 expected_regression_results = linregress(y=other_asset_returns, x=my_asset_returns) for i, output in enumerate(outputs): expected_output_results[output][day, asset_column] = expected_regression_results[i] for output in outputs: assert_frame_equal( output_results[output], DataFrame( expected_output_results[output], index=self.dates[start_date_index : end_date_index + 1], columns=assets, ), )
meanGroundedTime = []
for robot in dataset[0]:
    robotGroundedTimesteps = []
    for row in robot:
        rowGrounded = 1 if 1 in row else 0
        robotGroundedTimesteps.append(rowGrounded)
    meanGroundedTime.append(np.mean(robotGroundedTimesteps))

print(np.mean(meanGroundedTime), np.std(meanGroundedTime),
      min(meanGroundedTime), max(meanGroundedTime))

rp = pearsonr(meanGroundedTime, dataset[REWARD_SIGNALS])
print(rp)
rs = spearmanr(meanGroundedTime, dataset[REWARD_SIGNALS])
print(rs)
lr = linregress(meanGroundedTime, dataset[REWARD_SIGNALS])
print(lr)
fit = np.polyfit(meanGroundedTime, dataset[REWARD_SIGNALS], 1)
print(fit)

fitfn = np.poly1d(lr[0:2])
plt.plot(meanGroundedTime, dataset[REWARD_SIGNALS], 'go',
         np.arange(.4, 1.1, .01), fitfn(np.arange(.4, 1.1, .01)), '--k')
plt.title("Proportion Time Grounded and Normalized Reinforcement Signal"
          + "\n For " + robotType[0].upper() + robotType[1:] + " Robot with \"jump\" Command")
plt.ylabel("Normalized Reinforcement Signal")
plt.xlabel("Proportion of Time Grounded")
plt.axis([.45, 1.05, -1.1, 1.1])
plt.show()
model_sum.append(model_hour_sum) validation_sum.append(validation_hour_sum) model_dump.append(np.nansum(model_sum)) validation_dump.append(np.nansum(validation_sum)) err = np.abs(np.array([validation_dump]) - np.array([model_dump])) err_av = np.mean(err) sum1 = np.sum(model_dump) print('Difference between valid and model =', err) print('Average daily error =', err_av) print('Sum of model LWD =', sum1) print('Sum of valid LWD =', np.sum(validation_dump)) slope, intercept, r_value, p_value, std_err = \ linregress(model_dump,validation_dump) print('slope =', slope) print('intercept =', intercept) counter1 = np.zeros_like(model_dump) counter2 = np.zeros_like(model_dump) counter5 = np.zeros_like(model_dump) # Counters for number of model LWD that are between 1, 2 and 5 hours of # validation LWD for i in range(len(model_dump)): if (model_dump[i]<=validation_dump[i]+1 and \ model_dump[i]>=validation_dump[i]-1): counter1[i] = 1 if (model_dump[i]<=validation_dump[i]+2 and \
len(histDic1) plt.scatter(histDic1.keys(),histDic1.values()) plt.hist(histDic1.values()) histDic2=[np.count_nonzero((binwidth-tol < testBin) & (testBin < binwidth+tol)) for binwidth in range(100)] len(histDic2) histDicT=bInt(binSs,0.02) len(histDicT) plt.hist(histDicT.values(),bins=sample)#np.arange(min(dataD), max(dataD) + binwidth, binwidth)) from scipy.stats.stats import linregress for value in binSs[:6]: print(value) for index in range(0, len(binSs[:sample]) - 6): s, intercept, r, p, std_error = linregress(binSs[index:index + 7], binSs[index:index + 7]) print(s, intercept, r, p, std_error,"\n") for value in histDicT: temp = histDicT[value] print(value,temp)#,histDicT[0],histDicT[1],histDicT[2]) histDicT=histDic1 histDicT=bInt(binS,0.02) import itertools slope1=[] slope1pos=[] for index in range(0, len(histDicT) - 6): tempD=dict(itertools.islice(histDicT.items(), index,index + 7)) #print(list(tempD.values())) s, intercept, r, p, std_error = linregress(list(tempD.keys()), list(tempD.values())) #print(index,tempD,s, intercept, r, p, std_error) slope1.append(s)
""" roi_data_mean = np.ones(len(names))*-99 roi_data_std = np.ones(len(names))*-99 roi_data_r = np.ones(len(names))*-99 roi_data_p = np.ones(len(names))*-99 roi_data_m = np.ones(len(names))*-99 for i, name in enumerate(names): #wm_name = 'wm-' + hemi + '-' + name wm_name = '{}_{}'.format(hemi, name) if wm_name in df1.columns: df_merge = df1.merge(df2, on='nspn_id') roi_data_mean[i] = df1[wm_name].mean() roi_data_std[i] = df1[wm_name].std() m, c, r, p, sterr = linregress(df_merge[wm_name + '_x'], df_merge[wm_name + '_y']) roi_data_m[i] = m roi_data_r[i] = r roi_data_p[i] = 1 - p """ Make a vector containing the data point at each vertex. """ vtx_data_mean = roi_data_mean[labels] vtx_data_std = roi_data_std[labels] vtx_data_r = roi_data_r[labels] vtx_data_p = roi_data_p[labels] vtx_data_m = roi_data_m[labels] """
d2 = hpcs.double[t2,t1] naive = s1['instructions'] + s2['instructions'] actual = d1['instructions'] + d2['instructions'] degr = actual / naive degradations += [degr] for k, v1 in s1.items(): v2 = s2[k] total = v1 + v2 counters[k] += [total] # total = gettotal(shpc1, shpc2, ['LLC-stores', 'LLC-loads']) total = gettotal(s1, s2, ['instructions']) plotdata[total] = degr for counter,v in counters.items(): cor, pv = pearsonr(v, degradations) if pv < 0.1: print ("{:25} {: .3f} {:2.1%}".format(counter, cor, pv*100)) if plotdata: X = sorted(list(plotdata.keys())) Y = [plotdata[x] for x in X] print(linregress(X,Y)) p.xlabel("counters") p.ylabel("degradatation") p.plot(X, Y, '-o') p.show()
def main(): usage = 'usage: %prog [opt] lfq_filename gene_exprs_filename output_filename'\ '\nThree arguments must be specified in command line:\n'\ '1) LFQ filename, containing LFQ intensities and two replicates.\n'\ '2) Gene exprs filename, read count data.\n'\ '3) AS status of genes (miso output)\n' parser = OptionParser(usage=usage) # colnames for lfq data parser.add_option('--lfq_gene_colname', dest='lfq_gene_colname', default='Gene names', help='Column name of gene name') parser.add_option('--samp1_lfq_colname1', dest='samp1_lfq_colname1', default='LFQ intensity T331_1', help='Column name of LFQ intensity, sample 1 replicate 1.') parser.add_option('--samp1_lfq_colname2', dest='sampl1_lfq_colname2', default='LFQ intensity T331_2', help='Column name of LFQ intensity, sample 1 replicate 2.') parser.add_option('--samp2_lfq_colname1', dest='sampl2_lfq_colname1', default='LFQ intensity R_1', help='Column name of LFQ intensity, sample 2 replicate 1.') parser.add_option('--samp2_lfq_colname2', dest='sampl2_lfq_colname2', default='LFQ intensity R_2', help='Column name of LFQ intensity, sample 2 replicate 2.') # colnames for gene exprs data parser.add_option('--mrna_gene_colname', dest='mrna_gene_colname', default='gene_name', help='Column name for mRNA exprs data.') parser.add_option('--samp1_exprs_colname', dest='samp1_exprs_colname', default='LTL331', help='Column name of gene exprs for sample 1') parser.add_option('--samp2_exprs_colname', dest='samp2_exprs_colname', default='LTL331_R', help='Column name of gene exprs for sample 2') parser.add_option('--spliced_only', dest='spliced_only', default='False', help='True or False. True shows only spliced genes. '\ 'False shows all. Default is False.') parser.add_option('--convert_to_log2', dest='convert_to_log2', default='True', help='True or False, converts mRNA exprs data to to log2'\ ' scale. Default True.') parser.add_option('--title', dest='title', default='Plot title', help='Title of plot.') parser.add_option('--xlabel', dest='xlabel', default='x axis', help='X axis label of plot') parser.add_option('--ylabel', dest='ylabel', default='y axis', help='Y axis label of plot') parser.add_option('--annotate_genes', dest='annotate_genes', default=None, help='CSV list of genes to be annotated.\n'\ 'Default is None, allowing mouse click annotation.') (options, args) = parser.parse_args() if len(args) < 3: print 'Not enough args specified.\n%s' %usage sys.exit() lfq_filename = args[0] gene_exprs_filename = args[1] miso_filename = args[2] # parse options # splicing only option spliced_only = options.spliced_only if spliced_only in ['True', 'true', 'T', 'TRUE']: spliced_only = True elif spliced_only in ['False', 'false', 'F', 'FALSE']: spliced_only = False else: print 'Spliced only option must be True or False. %s found.' \ %spliced_only sys.exit() print 'splicing_only: %s' %spliced_only # log2 conversion option convert_to_log2 = options.convert_to_log2 if convert_to_log2 in ['True', 'T']: convert_to_log2 = True elif convert_to_log2 in ['False', 'F']: convert_to_log2 = False else: print '--convert_to_log2 must be True or False. 
%s found.'\ %convert_to_log2 print 'Convert to log2: %s' %convert_to_log2 # xlabel, ylabel, title options xlabel = options.xlabel ylabel = options.ylabel title = options.title # annotate genes options if options.annotate_genes is not None: annotated_gene_list = options.annotate_genes.split(',') else: annotated_gene_list = options.annotate_genes lfq_mrna_dic = {} # Add LFQ information to dic lfq_mrna_dic = index_lfq_data(lfq_filename, lfq_mrna_dic, options, filter_out_missing_data=True) print 'lfq data indexed from file: %s' %lfq_filename # Add gene exprs to dic lfq_mrna_dic = index_mrna_data(gene_exprs_filename, lfq_mrna_dic, options, filter_na=True, convert_to_log2=convert_to_log2) print 'mrna data indexed from file: %s' %gene_exprs_filename # Write dic to file # write_lfq_mrna_data_to_file(lfq_mrna_dic, out_filename, options) # Get differentially spliced genes (non-redundant only) spliced_genes = list(set(get_spliced_genes(miso_filename))) print '%s spliced genes extracted from %s' %(len(spliced_genes), miso_filename) # Calculate Pearson and Spearman correlation for non-AS genes and AS genes # Create x and y vectors for spliced, nonspliced and both spliced_mrna_log2_fc, spliced_lfq_diff = \ split_by_splice_status(lfq_mrna_dic, spliced_genes, spliced=True) non_spliced_mrna_log2_fc, non_spliced_lfq_diff = \ split_by_splice_status(lfq_mrna_dic, spliced_genes, spliced=False) mrna_log2_fc, lfq_diff = \ split_by_splice_status(lfq_mrna_dic, spliced_genes, spliced=None) # Calculate r and pvals for Pearson for mrna_diff_vector, \ lfq_diff_vector, \ splice_status in \ zip([spliced_mrna_log2_fc, non_spliced_mrna_log2_fc, mrna_log2_fc], [spliced_lfq_diff, non_spliced_lfq_diff, lfq_diff], ['DS Genes', 'Non-DS Genes', 'All Genes']): pearsonr, pearsonpval = \ stats.pearsonr(mrna_diff_vector, lfq_diff_vector) print 'Gene set:%s\nPearson coefficient: %s\nPval:%s' \ %(splice_status, pearsonr, pearsonpval) spearmanr, spearmanpval = \ stats.spearmanr(mrna_diff_vector, lfq_diff_vector) print 'Gene set:%s\nSpearman coefficient: %s\nPval:%s' \ %(splice_status, spearmanr, spearmanpval) slope, intercept, r_value, p_value, std_err = stats.linregress(mrna_diff_vector,lfq_diff_vector) print 'slope: %s\nintercept: %s\nr_value: %s\nstd_error: %s' %(slope, intercept, r_value, std_err) # calculate concordants concord_count = 0 all_count = 0 for mrna, lfq in zip(mrna_diff_vector, lfq_diff_vector): if mrna * lfq >= 0: # means concordant concord_count += 1 all_count += 1 frac_concord = float(concord_count) / all_count print 'Gene set:%s\nConcordance:%s/%s, %s' %(splice_status, concord_count, all_count, frac_concord) # Scatterplot data scatter_plot_lfq_mrna(lfq_mrna_dic, spliced_genes, spliced_only=spliced_only, title=title, xlabel=xlabel, ylabel=ylabel, annotated_gene_list=annotated_gene_list)