def test_correlation_matrix(self):
    """correlation_matrix entries should equal the pairwise correlation() r"""
    series_a = [2, 4, 6, 8]
    series_b = [1.5, 1.4, 1.2, 1.1]
    series_c = [15, 10, 5, 20]
    matrix = correlation_matrix([series_a, series_b, series_c])

    # Row 0: only the diagonal entry is checked.
    self.assertFloatEqual(matrix[0, 0], [1.0])

    # Row 1: r against the first series, then the diagonal.
    observed_row_1 = [matrix[1, 0], matrix[1, 1]]
    expected_row_1 = [correlation(series_b, series_a)[0], 1.0]
    self.assertFloatEqual(observed_row_1, expected_row_1)

    # Row 2: r against both earlier series, then the diagonal.
    expected_row_2 = [
        correlation(series_c, series_a)[0],
        correlation(series_c, series_b)[0],
        1.0,
    ]
    self.assertFloatEqual(matrix[2], expected_row_2)
def test_correlation_matrix(self):
    """correlation_matrix entries should equal the pairwise correlation() r"""
    series_a = [2, 4, 6, 8]
    series_b = [1.5, 1.4, 1.2, 1.1]
    series_c = [15, 10, 5, 20]
    matrix = correlation_matrix([series_a, series_b, series_c])

    # Row 0: only the diagonal entry is checked.
    self.assertFloatEqual(matrix[0, 0], [1.0])

    # Row 1: r against the first series, then the diagonal.
    observed_row_1 = [matrix[1, 0], matrix[1, 1]]
    expected_row_1 = [correlation(series_b, series_a)[0], 1.0]
    self.assertFloatEqual(observed_row_1, expected_row_1)

    # Row 2: r against both earlier series, then the diagonal.
    expected_row_2 = [
        correlation(series_c, series_a)[0],
        correlation(series_c, series_b)[0],
        1.0,
    ]
    self.assertFloatEqual(matrix[2], expected_row_2)
def evaluate_test_dataset(observed_table, expected_table):
    """Evaluate the correlation between an observed and expected biom table.

    observed_table, expected_table -- table objects exposing
        ids(axis='observation') and data(obs_id, axis='observation').

    Returns (scatter_data_points, correlations):
      scatter_data_points -- list of (observed, expected) value pairs taken
        from the observations present in both tables, flattened.
      correlations -- dict keyed by method. "pearson" and "spearman" hold
        (r, probability). When more than two data points are available,
        "pearson_r2" and "spearman_r2" are also present as one-element
        lists; with two or fewer points both methods are (None, None) and
        the r2 keys are absent.

    Raises ValueError if the two tables share no observation ids.
    """
    observed_ids = observed_table.ids(axis='observation')
    expected_ids = expected_table.ids(axis='observation')

    # identify the overlapping otus that can be used to predict metagenomes
    overlapping_ids = list(set(observed_ids) & set(expected_ids))

    if not overlapping_ids:
        # Single-argument print() produces the same output on Python 2 and 3.
        print("obs ids: %s" % (observed_ids[0:10],))
        print("exp ids: %s" % (expected_ids[0:10],))
        raise ValueError(
            "No ids are in common between the observed and expected tables, so no evaluations can be performed.")

    # We need the data in numpy arrays, so it makes sense to collect the
    # rows directly rather than filtering the tables.
    obs_data = [observed_table.data(obs_id, axis='observation')
                for obs_id in overlapping_ids]
    exp_data = [expected_table.data(obs_id, axis='observation')
                for obs_id in overlapping_ids]

    flat_obs_data = ravel(array(obs_data))
    flat_exp_data = ravel(array(exp_data))

    # Scatter plot points; list() keeps len() working where zip is lazy.
    scatter_data_points = list(zip(flat_obs_data, flat_exp_data))

    correlations = {}
    if len(scatter_data_points) <= 2:
        # can't validly calc correlation
        correlations["pearson"] = (None, None)
        correlations["spearman"] = (None, None)
        return scatter_data_points, correlations

    pearson_r, pearson_t_prob = correlation(flat_obs_data, flat_exp_data)
    correlations["pearson"] = (pearson_r, pearson_t_prob)
    correlations["pearson_r2"] = [pearson_r**2]

    spearman_r, spearman_t_prob = \
        spearman_correlation(flat_obs_data, flat_exp_data)
    correlations["spearman"] = (spearman_r, spearman_t_prob)
    correlations["spearman_r2"] = [spearman_r**2]

    return scatter_data_points, correlations
def run_single_correlation(OTU, category_info, otu_sample_info):
    """Correlate one OTU's per-sample counts with a numeric category.

    OTU -- key into otu_sample_info
    category_info -- mapping of sample -> category value (float-convertible)
    otu_sample_info -- mapping of OTU -> {sample: count}

    Samples missing from the OTU's entry are counted as 0.
    Returns the (r, prob) pair produced by correlation().
    Raises ValueError if a value cannot be converted to float.
    """
    counts_by_sample = otu_sample_info[OTU]
    abundances = []
    categories = []

    for sample in category_info:
        # an OTU that was not observed in this sample contributes count=0
        count = counts_by_sample[sample] if sample in counts_by_sample else 0
        try:
            category = float(category_info[sample])
            categories.append(category)
            abundances.append(float(count))
        except ValueError:
            raise ValueError(
                "The category values must be numeric to use the correlation option"
            )

    r, prob = correlation(Numbers(abundances), Numbers(categories))
    return r, prob
def run_single_correlation(OTU_abundance_values, category_values,
                           filter=1):
    """Run correlation() of category values against OTU abundances.

    filter -- minimum number of category values required; it is converted
        with int() before comparison.

    Returns (r, prob) from correlation(), or (None, None) when fewer than
    `filter` category values were supplied.
    """
    # Too few samples: skip the calculation entirely.
    if len(category_values) < int(filter):
        return None, None

    r, prob = correlation(Numbers(category_values),
                          Numbers(OTU_abundance_values))
    return r, prob
def test_compare(self):
    """Internal ASA should correlate with STRIDE ASA (probability ~ 0)."""
    self.input_file = os.path.join('data', '2E12.pdb')
    self.input_structure = PDBParser(open(self.input_file))

    try:
        asa.asa_xtra(self.input_structure, mode='stride')
    except ApplicationNotFoundError:
        # No STRIDE values to compare against, so there is nothing to check.
        return

    asa.asa_xtra(self.input_structure)
    self.input_structure.propagateData(sum, 'A', 'ASA', xtra=True)

    residues = einput(self.input_structure, 'R')
    # presumably this drops water (H_HOH) residues -- confirm against
    # selectChildren's semantics
    selected = residues.selectChildren('H_HOH', 'ne', 'name').values()
    asa_pairs = [(residue.xtra['ASA'], residue.xtra['STRIDE_ASA'])
                 for residue in selected]
    internal_asa = [pair[0] for pair in asa_pairs]
    stride_asa = [pair[1] for pair in asa_pairs]

    probability = correlation(internal_asa, stride_asa)[1]
    self.assertAlmostEqual(probability, 0.)
def test_compare(self):
    """Internal ASA should correlate with STRIDE ASA (probability ~ 0)."""
    self.input_file = os.path.join('data', '2E12.pdb')
    self.input_structure = PDBParser(open(self.input_file))

    try:
        asa.asa_xtra(self.input_structure, mode='stride')
    except ApplicationNotFoundError:
        # No STRIDE values to compare against, so there is nothing to check.
        return

    asa.asa_xtra(self.input_structure)
    self.input_structure.propagateData(sum, 'A', 'ASA', xtra=True)

    residues = einput(self.input_structure, 'R')
    # presumably this drops water (H_HOH) residues -- confirm against
    # selectChildren's semantics
    selected = residues.selectChildren('H_HOH', 'ne', 'name').values()
    asa_pairs = [(residue.xtra['ASA'], residue.xtra['STRIDE_ASA'])
                 for residue in selected]
    internal_asa = [pair[0] for pair in asa_pairs]
    stride_asa = [pair[1] for pair in asa_pairs]

    probability = correlation(internal_asa, stride_asa)[1]
    self.assertAlmostEqual(probability, 0.)
def test_correlation(self):
    """correlation() r and significance should match R's cor.test()"""
    x = [1, 2, 3, 5]
    all_zeros = [0, 0, 0, 0]
    all_ones = [1, 1, 1, 1]

    # Identical and constant series.
    self.assertFloatEqual(correlation(x, x), (1, 0))
    self.assertFloatEqual(correlation(x, all_zeros), (0, 1))
    self.assertFloatEqual(correlation(all_zeros, all_ones), (0, 1))

    # Reference (r, probability) values, each with its absolute tolerance.
    reference_cases = (
        ([2, 4, 6, 8], (0.9827076, 0.01729), 1e-5),
        ([1.5, 1.4, 1.2, 1.1], (-0.9621405, 0.03786), 1e-5),
        ([15, 10, 5, 20], (0.3779645, 0.622), 1e-3),
    )
    for other, expected, tolerance in reference_cases:
        self.assertFloatEqualAbs(correlation(x, other), expected, tolerance)

    # Regression check: this input originally gave r = 1.0000000002
    bad = [1, 2, 3]
    self.assertEqual(correlation(bad, bad), (1, 0))
def test_correlation(self):
    """correlation() r and significance should match R's cor.test()"""
    x = [1, 2, 3, 5]
    all_zeros = [0, 0, 0, 0]
    all_ones = [1, 1, 1, 1]

    # Identical and constant series.
    self.assertFloatEqual(correlation(x, x), (1, 0))
    self.assertFloatEqual(correlation(x, all_zeros), (0, 1))
    self.assertFloatEqual(correlation(all_zeros, all_ones), (0, 1))

    # Reference (r, probability) values, each with its absolute tolerance.
    reference_cases = (
        ([2, 4, 6, 8], (0.9827076, 0.01729), 1e-5),
        ([1.5, 1.4, 1.2, 1.1], (-0.9621405, 0.03786), 1e-5),
        ([15, 10, 5, 20], (0.3779645, 0.622), 1e-3),
    )
    for other, expected, tolerance in reference_cases:
        self.assertFloatEqualAbs(correlation(x, other), expected, tolerance)

    # Regression check: this input originally gave r = 1.0000000002
    bad = [1, 2, 3]
    self.assertEqual(correlation(bad, bad), (1, 0))
def spearman_correlation(x_array, y_array, tails="two-tailed"):
    """Calculate the Spearman rank correlation for x and y.

    x_array -- a 1D NumPy array
    y_array -- a 1D NumPy array
    tails -- passed through unchanged to calc_spearman_t

    Returns (r, spearman_t_prob).
    """
    # Spearman's r is the Pearson r of the rank-transformed data.
    ranked_x, ranked_y = [convert_vals_to_spearman_ranks(values)
                          for values in (x_array, y_array)]
    r, _unused_pearson_prob = correlation(ranked_x, ranked_y)

    # The rank conversion affects the probability, so the Pearson
    # probability is discarded in favour of the Spearman-specific t test.
    return r, calc_spearman_t(r, n=len(x_array), tails=tails)
def spearman_correlation(x_array, y_array, tails='two-tailed'):
    """Calculate the Spearman rank correlation for x and y.

    x_array -- a 1D NumPy array
    y_array -- a 1D NumPy array
    tails -- passed through unchanged to calc_spearman_t

    Returns (r, spearman_t_prob).
    """
    # Spearman's r is the Pearson r of the rank-transformed data.
    ranked_x, ranked_y = [convert_vals_to_spearman_ranks(values)
                          for values in (x_array, y_array)]
    r, _unused_pearson_prob = correlation(ranked_x, ranked_y)

    # The rank conversion affects the probability, so the Pearson
    # probability is discarded in favour of the Spearman-specific t test.
    return r, calc_spearman_t(r, n=len(x_array), tails=tails)
def run_single_correlation(OTU_abundance_values, category_values):
    """Return correlation() of the category values against OTU abundances.

    Both inputs are wrapped in Numbers first; the category values are
    passed as the first argument.
    """
    categories = Numbers(category_values)
    abundances = Numbers(OTU_abundance_values)
    return correlation(categories, abundances)
def hommola_cospeciation_test(host_dist, par_dist, matrix, permutations):
    """Performs the cospeciation permutation test from Hommola et al.

    host_dist -- j x j numpy matrix of host distances
    par_dist -- i x i numpy matrix of 'parasite' (OTU) distances
    matrix -- binary i x j association matrix; an entry equal to 1 at
        [i, j] links parasite i to host j
    permutations -- number of random permutations to perform

    Returns (p_val, r, perm_stats):
      p_val -- (count of permuted r >= observed r, plus 1) divided by
        (permutations + 1)
      r -- observed correlation between host and parasite edge distances
      perm_stats -- list of the correlation from each permutation

    test data from Hommola et al MB&E 2009:
    hdist = numpy.array([[0,3,8,8,9],[3,0,7,7,8],[8,7,0,6,7],[8,7,6,0,3],[9,8,7,3,0]])
    pdist = numpy.array([[0,5,8,8,8],[5,0,7,7,7],[8,7,0,4,4],[8,7,4,0,2],[8,7,4,2,0]])
    int = numpy.array([[1,0,0,0,0],[0,1,0,0,0],[0,0,1,0,0],[0,0,0,1,0],[0,0,0,1,1]])

    This is basically a direct translation from the R code, and not
    optimized in any way for Python.

    NOTE: the method return signature is now changed.
    For backwards compatibility purposes -
    when this method is called, 'result' has changed to 'result[0]'
    """
    import cogent.maths.stats.test as stats
    from random import shuffle

    n_pars, n_hosts = matrix.shape[0], matrix.shape[1]

    # Generate lists of host and symbiont edges, such that the index
    # of the lists represents an edge connecting the host to the parasite.
    # One row-major pass collects every entry equal to 1; this cannot spin
    # forever when the matrix holds values other than 0/1.
    hosts = []
    pars = []
    for i in range(n_pars):
        for j in range(n_hosts):
            if matrix[i, j] == 1:
                hosts.append(j)
                pars.append(i)

    # get a vector of pairwise distances for each interaction edge
    x = get_dist(hosts, host_dist, list(range(n_hosts)))
    y = get_dist(pars, par_dist, list(range(n_pars)))

    # calculate the observed correlation coefficient for this host/symbionts
    r = stats.correlation(x, y)[0]

    # now do permutations. Index lists must be real lists so that shuffle()
    # can reorder them in place.
    mp = list(range(par_dist.shape[1]))
    mh = list(range(host_dist.shape[1]))
    below = 0

    perm_stats = []  # initialize list of shuffled correlation vals

    for _ in range(permutations):
        # Generate a shuffled list of indexes for each permutation. This
        # effectively randomizes which host is associated with which
        # symbiont, but maintains the distribution of genetic distances.
        shuffle(mp)
        shuffle(mh)

        # Get pairwise distances in shuffled order
        y_p = get_dist(pars, par_dist, mp)
        x_p = get_dist(hosts, host_dist, mh)

        # calculate shuffled correlation; count it when it is at least as
        # large as the observed value.
        r_p = stats.correlation(x_p, y_p)[0]
        perm_stats.append(r_p)
        if r_p >= r:
            below += 1

    p_val = float(below + 1) / float(permutations + 1)

    return p_val, r, perm_stats
def evaluate_test_dataset(observed_table, expected_table):
    """Evaluate the correlation between an observed and expected biom table.

    observed_table, expected_table -- table objects exposing an
        ObservationIds sequence and an observationData(obs_id) method.

    Returns (scatter_data_points, correlations):
      scatter_data_points -- list of (observed, expected) value pairs taken
        from the observations present in both tables, flattened.
      correlations -- dict keyed by method. "pearson" and "spearman" hold
        (r, probability). When more than two data points are available,
        "pearson_r2" and "spearman_r2" are also present as one-element
        lists; with two or fewer points both methods are (None, None) and
        the r2 keys are absent.

    Raises ValueError if the two tables share no observation ids.
    """
    # identify the overlapping otus that can be used to predict metagenomes
    overlapping_ids = list(
        set(observed_table.ObservationIds) &
        set(expected_table.ObservationIds))

    if not overlapping_ids:
        # Single-argument print() produces the same output on Python 2 and 3.
        print("obs ids: %s" % (observed_table.ObservationIds[0:10],))
        print("exp ids: %s" % (expected_table.ObservationIds[0:10],))
        raise ValueError(
            "No ids are in common between the observed and expected tables, so no evaluations can be performed.")

    # We need the data in numpy arrays, so it makes sense to collect the
    # rows directly rather than filtering the tables.
    obs_data = [observed_table.observationData(obs_id)
                for obs_id in overlapping_ids]
    exp_data = [expected_table.observationData(obs_id)
                for obs_id in overlapping_ids]

    flat_obs_data = ravel(array(obs_data))
    flat_exp_data = ravel(array(exp_data))

    # Scatter plot points; list() keeps len() working where zip is lazy.
    scatter_data_points = list(zip(flat_obs_data, flat_exp_data))

    correlations = {}
    if len(scatter_data_points) <= 2:
        # can't validly calc correlation
        correlations["pearson"] = (None, None)
        correlations["spearman"] = (None, None)
        return scatter_data_points, correlations

    pearson_r, pearson_t_prob = correlation(flat_obs_data, flat_exp_data)
    correlations["pearson"] = (pearson_r, pearson_t_prob)
    correlations["pearson_r2"] = [pearson_r**2]

    spearman_r, spearman_t_prob = \
        spearman_correlation(flat_obs_data, flat_exp_data)
    correlations["spearman"] = (spearman_r, spearman_t_prob)
    correlations["spearman_r2"] = [spearman_r**2]

    return scatter_data_points, correlations
def run_single_correlation(OTU_abundance_values, category_values):
    """Return correlation() of the category values against OTU abundances.

    Both inputs are wrapped in Numbers first; the category values are
    passed as the first argument.
    """
    categories = Numbers(category_values)
    abundances = Numbers(OTU_abundance_values)
    return correlation(categories, abundances)
def plot_regression_line(x, y, line_color='r', axes=None, prob_axes=False,
                         axis_range=None):
    """Plots the regression line, and returns the equation.

    x and y are the x and y data for a single series
    line_color is a matplotlib color, will be used for the line
    axes is the axes the regression will be plotted against; defaults to
        the result of gca()
    prob_axes, if true, forces the axes to be between 0 and 1
    axis_range, if not None, forces the axes to be between
        (xmin, xmax, ymin, ymax).

    Returns (equation, line_color), where equation is a string giving the
    fitted line and r^2. The line is drawn only when both of its (clipped)
    end points lie within the x limits; a flat line lying wholly outside
    the y limits is not drawn.
    """
    if axes is None:
        axes = gca()
    m, b = regress(x, y)
    r, significance = correlation(x, y)
    # format r^2, the slope (m) and the intercept (b) for the equation
    r_str = '%0.3g' % (r**2)
    m_str = '%0.3g' % m
    b_str = '%0.3g' % b

    # We want to clip the line so it's contained entirely within the graph
    # coordinates: where the line leaves the y limits at x_min or x_max,
    # the end point is moved to where the line crosses that y limit.
    drawable = True
    if (not prob_axes) and (axis_range is None):
        # no limits were requested: span the empirical x range
        x1, x2 = min(x), max(x)
        y1, y2 = m*x1 + b, m*x2 + b
        x_min, x_max = x1, x2
    else:
        if prob_axes:
            x_min, x_max = 0, 1
            y_min, y_max = 0, 1
        else:  # axis range must have been set
            x_min, x_max, y_min, y_max = axis_range

        def clipped_point(x_edge):
            """Line point at x_edge, pulled onto the y limit it overshoots.

            Returns None for a zero-slope line outside the y limits, since
            such a line never crosses the limit.
            """
            y_edge = m*x_edge + b
            if y_edge < y_min:      # too low: find x at y_min
                y_limit = y_min
            elif y_edge > y_max:    # too high: find x at y_max
                y_limit = y_max
            else:                   # just right
                return x_edge, y_edge
            if m == 0:
                return None
            return (y_limit - b)/m, y_limit

        start = clipped_point(x_min)
        end = clipped_point(x_max)
        drawable = (start is not None) and (end is not None)
        if drawable:
            (x1, y1), (x2, y2) = start, end

    # skip lines whose clipped end points fall outside the x limits
    if drawable and (x_min <= x1 <= x_max) and (x_min <= x2 <= x_max):
        axes.plot([x1, x2], [y1, y2], color=line_color, linewidth=0.5)

    # a negative intercept already carries its own '-' sign in b_str
    if b >= 0:
        sign_str = ' + '
    else:
        sign_str = ' '
    equation = ''.join(['y= ', m_str, 'x', sign_str, b_str, '\nr$^2$=', r_str])
    return equation, line_color
def plot_regression_line(x, y, line_color='r', axes=None, prob_axes=False,
                         axis_range=None):
    """Plots the regression line, and returns the equation.

    x and y are the x and y data for a single series
    line_color is a matplotlib color, will be used for the line
    axes is the axes the regression will be plotted against; defaults to
        the result of gca()
    prob_axes, if true, forces the axes to be between 0 and 1
    axis_range, if not None, forces the axes to be between
        (xmin, xmax, ymin, ymax).

    Returns (equation, line_color), where equation is a string giving the
    fitted line and r^2. The line is drawn only when both of its (clipped)
    end points lie within the x limits; a flat line lying wholly outside
    the y limits is not drawn.
    """
    if axes is None:
        axes = gca()
    m, b = regress(x, y)
    r, significance = correlation(x, y)
    # format r^2, the slope (m) and the intercept (b) for the equation
    r_str = '%0.3g' % (r**2)
    m_str = '%0.3g' % m
    b_str = '%0.3g' % b

    # We want to clip the line so it's contained entirely within the graph
    # coordinates: where the line leaves the y limits at x_min or x_max,
    # the end point is moved to where the line crosses that y limit.
    drawable = True
    if (not prob_axes) and (axis_range is None):
        # no limits were requested: span the empirical x range
        x1, x2 = min(x), max(x)
        y1, y2 = m*x1 + b, m*x2 + b
        x_min, x_max = x1, x2
    else:
        if prob_axes:
            x_min, x_max = 0, 1
            y_min, y_max = 0, 1
        else:  # axis range must have been set
            x_min, x_max, y_min, y_max = axis_range

        def clipped_point(x_edge):
            """Line point at x_edge, pulled onto the y limit it overshoots.

            Returns None for a zero-slope line outside the y limits, since
            such a line never crosses the limit.
            """
            y_edge = m*x_edge + b
            if y_edge < y_min:      # too low: find x at y_min
                y_limit = y_min
            elif y_edge > y_max:    # too high: find x at y_max
                y_limit = y_max
            else:                   # just right
                return x_edge, y_edge
            if m == 0:
                return None
            return (y_limit - b)/m, y_limit

        start = clipped_point(x_min)
        end = clipped_point(x_max)
        drawable = (start is not None) and (end is not None)
        if drawable:
            (x1, y1), (x2, y2) = start, end

    # skip lines whose clipped end points fall outside the x limits
    if drawable and (x_min <= x1 <= x_max) and (x_min <= x2 <= x_max):
        axes.plot([x1, x2], [y1, y2], color=line_color, linewidth=0.5)

    # a negative intercept already carries its own '-' sign in b_str
    if b >= 0:
        sign_str = ' + '
    else:
        sign_str = ' '
    equation = ''.join(['y= ', m_str, 'x', sign_str, b_str, '\nr$^2$=', r_str])
    return equation, line_color