def compute_mncp(predicted, cutoff, label):
    """
    This is the MNCP computation adopted from Clarke 2003. MNCP is a
    rank-based metric similar to AUC, but it is a plot of TP against all
    positives, hence considered to be less affected by false positives.
    MNCP is the mean normalized conditional probability.
    """
    from numpy import mean, array, hstack
    from scipy import stats  # needed for rankdata below
    if label == 1:
        fg_vals = predicted[:cutoff]
        bg_vals = predicted[cutoff:]
    else:
        fg_vals = predicted[cutoff:]
        bg_vals = predicted[:cutoff]
    fg_len = len(fg_vals)
    total_len = len(fg_vals) + len(bg_vals)
    if type(fg_vals) != type(array([])):
        fg_vals = array(fg_vals)
    if type(bg_vals) != type(array([])):
        bg_vals = array(bg_vals)
    # Rank the foreground data
    fg_rank = stats.rankdata(fg_vals)
    # combine foreground and background data and get the ranks
    total_rank = stats.rankdata(hstack((fg_vals, bg_vals)))
    slopes = []
    for i in range(len(fg_vals)):
        slope = ((fg_len - fg_rank[i] + 1) / fg_len) / \
                ((total_len - total_rank[i] + 1) / total_len)
        slopes.append(slope)
    mncp = mean(slopes)
    return mncp
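# A minimal usage sketch for compute_mncp above, on made-up scores: with
# label == 1 the first `cutoff` entries of `predicted` are treated as the
# positive (foreground) set. All values here are illustrative.
import numpy as np

scores = np.array([0.9, 0.8, 0.7, 0.4, 0.3, 0.2, 0.1])
mncp = compute_mncp(scores, cutoff=3, label=1)  # first 3 entries are positives
print(mncp)  # values above 1 mean positives tend to outrank the background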
def rank_texts(cls):
    """
    Get total citation counts and ranks for texts.

    Returns: list
    """
    count = fn.Count(Citation.id)
    query = (
        Text.select(Text, count)
        .join(Citation)
        .where(Text.display == True)
        .where(Text.valid == True)
        .group_by(Text.id)
        .order_by(Text.id)
        .naive()
    )
    counts = [t.count for t in query]

    # Compute dense-rank ratios.
    dense_ranks = rankdata(counts, "dense")
    top = max(dense_ranks)
    scores = [float(r / top) for r in dense_ranks]

    # Compute overall ranks (#1 is most frequent).
    max_ranks = rankdata(counts, "max")
    top = max(max_ranks)
    ranks = [int(top - r + 1) for r in max_ranks]

    return [
        dict(zip(["text", "rank", "score"], t))
        for t in zip(query, ranks, scores)
    ]
def get_scores(self, a, b):
    to_ret = (rankdata(a, 'dense') / np.max(rankdata(a, 'dense')) -
              rankdata(b, 'dense') / np.max(rankdata(b, 'dense')))
    if isinstance(a, pd.Series):
        return pd.Series(to_ret, index=a.index)
    return to_ret
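# Hypothetical call of get_scores above: since `self` is unused, None can
# stand in for it. Positive entries mean the value ranks relatively higher
# in `a` than in `b` (normalized dense ranks).
import numpy as np
import pandas as pd
from scipy.stats import rankdata

a = pd.Series([1.0, 5.0, 3.0])
b = pd.Series([2.0, 2.0, 9.0])
print(get_scores(None, a, b))  # returns a pd.Series aligned to a.index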
def test_DissimilarityConsistencyMeasure():
    targets = np.tile(range(3), 2)
    chunks = np.repeat(np.array((0, 1)), 3)
    # correct results
    cres1 = 0.41894348
    cres2 = np.array([[0.16137995, 0.73062639, 0.59441713]])
    dc1 = data[0:3, :] - np.mean(data[0:3, :], 0)
    dc2 = data[3:6, :] - np.mean(data[3:6, :], 0)
    center = squareform(
        np.corrcoef(pdist(dc1, 'correlation'), pdist(dc2, 'correlation')),
        checks=False).reshape((1, -1))
    dsm1 = stats.rankdata(pdist(data[0:3, :], 'correlation').reshape((1, -1)))
    dsm2 = stats.rankdata(pdist(data[3:6, :], 'correlation').reshape((1, -1)))
    spearman = squareform(
        np.corrcoef(np.vstack((dsm1, dsm2))),
        checks=False).reshape((1, -1))
    ds = dataset_wizard(samples=data, targets=targets, chunks=chunks)
    dscm = DissimilarityConsistencyMeasure()
    res1 = dscm(ds)
    dscm_c = DissimilarityConsistencyMeasure(center_data=True)
    res2 = dscm_c(ds)
    dscm_sp = DissimilarityConsistencyMeasure(consistency_metric='spearman')
    res3 = dscm_sp(ds)
    ds.append(ds)
    chunks = np.repeat(np.array((0, 1, 2,)), 4)
    ds.sa['chunks'] = chunks
    res4 = dscm(ds)
    assert_almost_equal(np.mean(res1.samples), cres1)
    assert_array_almost_equal(res2.samples, center)
    assert_array_almost_equal(res3.samples, spearman)
    assert_array_almost_equal(res4.samples, cres2)
def match_number_density(dats, nd=None, mstar=None):
    """
    Cuts catalogs at a stellar mass such that the number density matches
    that found in Hearin and Watson
    """
    new_dats = defaultdict(dict)
    if nd is None:
        # keep the full catalog so both 'dat' and 'box_size' are available
        fiducial = get_catalog('HW')
        m_f = fiducial['dat']['mstar']
        n_f = rankdata(-m_f) / fiducial['box_size']**3
        nd = max(n_f)
        print(nd)
    for name, cat in dats.items():
        m = cat['dat']['mstar']
        n = rankdata(-m) / cat['box_size']**3
        m_s, n_s = zip(*sorted(zip(m, n)))
        idx = np.digitize(nd, n_s, right=True)
        ms_cut = m_s[min(idx, len(m_s) - 1)]
        nd_cut = n_s[min(idx, len(n_s) - 1)]
        print("Cut in", name, "at", ms_cut, "with nd:", nd_cut)
        d = cat['dat']
        new_cat = cat.copy()
        new_cat['cut'] = ms_cut
        new_cat['dat'] = d[d['mstar'] > ms_cut]
        new_dats[name] = new_cat
    return new_dats
def to_dict(self):
    params = self.regression_params
    result = self.result
    good_rows = self._possible_rows()
    row_ids = self._row_id_array()[good_rows]
    group_data = self._group_data(good_rows)
    groups = group_data['group_array'][good_rows]
    iv = result['iv']
    dv = result['dv']
    weights = self._non_censored_mask()[good_rows]
    logger.debug(repr(iv.shape))
    logger.debug(repr(weights.shape))
    xvals = ss.rankdata(iv)
    yvals = ss.rankdata(dv)
    regression_line = self._estimate_regression_line(xvals, yvals)
    logger.debug(xvals)
    logger.debug(yvals)
    points = np.column_stack((row_ids, xvals, yvals, weights, groups))
    all_point_data = np.column_stack((points, iv, dv))
    logger.debug(self._all_point_cols())  # log the column names, not the bound method
    x_label, y_label = self._x_y_labels()
    return dict(
        points=points.tolist(),
        stats_diagnostics=self.diagnostics_list(),
        all_point_data=all_point_data.tolist(),
        all_point_cols=self._all_point_cols(),
        regression_line=regression_line,
        group_list=group_data['group_list'],
        x_label=x_label,
        y_label=y_label,
        model_type=params.model_type)
def _call(self, dataset):
    """Computes the average correlation in similarity structure across chunks."""
    chunks_attr = self.chunks_attr
    nchunks = len(np.unique(dataset.sa[chunks_attr]))
    if nchunks < 2:
        raise ValueError("This measure calculates similarity consistency across "
                         "chunks and is not meaningful for datasets with only "
                         "one chunk.")
    # calc neural sim b/w targ_comp targets per subject
    neur_sim = {}
    for s in np.unique(dataset.sa[chunks_attr]):
        ds_s = dataset[dataset.sa.chunks == s]
        neur_sim[s + '1'] = 1 - np.corrcoef(
            ds_s[ds_s.sa.targets == self.targ_comp1[0]],
            ds_s[ds_s.sa.targets == self.targ_comp1[1]])[0][1]
        neur_sim[s + '2'] = 1 - np.corrcoef(
            ds_s[ds_s.sa.targets == self.targ_comp2[0]],
            ds_s[ds_s.sa.targets == self.targ_comp2[1]])[0][1]
    # combine xSs_behavs
    xSs_behav = {}
    for s in self.xSs_behav1:
        xSs_behav[s + '1'] = self.xSs_behav1[s]
    for s in self.xSs_behav2:
        xSs_behav[s + '2'] = self.xSs_behav2[s]
    # create dsets where cols are neural sim and mt sim for correlations
    behav_neur = np.array([[xSs_behav[s], neur_sim[s]] for s in neur_sim])
    # correlate behav with neural sim b/w subjects
    if self.comparison_metric == 'spearman':
        xSs_corr = pearsonr(rankdata(behav_neur[:, 0]),
                            rankdata(behav_neur[:, 1]))
    else:
        xSs_corr = pearsonr(behav_neur[:, 0], behav_neur[:, 1])
    # returns Fisher z-transformed r coeff; could change to be p value if wanted...
    return Dataset(np.array([np.arctanh(xSs_corr[0])]))
def t2p(distr, alternative="greater", plus1=True):
    r"""
    Use the empirical distribution of a test statistic to compute
    p-values for every value in the distribution.

    Parameters
    ----------
    distr : array_like
        Empirical distribution of statistic
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test (default is 'greater')
    plus1 : bool
        flag for whether to add 1 to the numerator and denominator of the
        p-value based on the empirical permutation distribution.
        Default is True.

    Returns
    -------
    float
        the estimated p-value
    """
    if alternative not in ['greater', 'less', 'two-sided']:
        raise ValueError('Bad alternative')
    B = len(distr)
    if alternative != "less":
        pupper = 1 - rankdata(distr, method="min")/(plus1+B) + (1 + plus1)/(plus1+B)
        pvalue = pupper
    if alternative != "greater":
        plower = rankdata(distr, method="min") / (plus1+B) + plus1/(plus1+B)
        pvalue = plower
    if alternative == "two-sided":
        pvalue = np.min([np.ones(B), 2 * np.min([plower, pupper], 0)], 0)
    return pvalue
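# Small sketch for t2p above on a simulated permutation distribution; the
# standard-normal null is only an assumption for illustration.
import numpy as np
from scipy.stats import rankdata

rng = np.random.default_rng(0)
null_distr = rng.standard_normal(1000)
pvals = t2p(null_distr, alternative="two-sided")
print(pvals.min(), pvals.max())  # all p-values lie in (0, 1]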
def plot_predicted_vs_observed_pks(obs_score, pred_score, ofname):
    plt.figure()
    heatmap, xedges, yedges = numpy.histogram2d(
        rankdata(-obs_score, method='ordinal'),
        rankdata(pred_score, method='ordinal'),
        bins=20)
    heatmap, xedges, yedges = numpy.histogram2d(
        numpy.clip(-numpy.log(1+obs_score), -0.1, 0),
        numpy.clip(numpy.log(1+pred_score), 0, 0.1),
        bins=100)
    #heatmap, xedges, yedges = numpy.histogram2d(
    #    numpy.clip(-numpy.log(1+data['ATAC_mean']), -0.1, 0),
    #    numpy.clip(numpy.log(y), 0, 0.1),
    #    bins=100)
    #heatmap, xedges, yedges = numpy.histogram2d(
    #    rankdata(-data['ATAC_mean'], method='average'),
    #    rankdata(obs_score, method='average'),
    #    bins=20)
    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
    plt.clf()
    plt.imshow(heatmap, extent=extent)
    #plt.scatter(rankdata(obs_score, method='ordinal'),
    #            rankdata(pred_score, method='ordinal'))
    plt.savefig(ofname)
    plt.close()
    return
def t2p(distr, alternative="greater"):
    '''
    Use the empirical distribution of a test statistic to compute
    p-values for every value in the distribution.

    Parameters
    ----------
    distr : array_like
        Empirical distribution of statistic
    alternative : {'greater', 'less', 'two-sided'}
        The alternative hypothesis to test (default is 'greater')

    Returns
    -------
    float
        the estimated p-value
    '''
    B = len(distr)
    if alternative != "less":
        pupper = 1 - (rankdata(distr, method="min") / B) + 1/B
        pvalue = pupper
    if alternative != "greater":
        plower = rankdata(distr, method="min") / B
        pvalue = plower
    if alternative == "two-sided":
        pvalue = np.min([np.ones(B), 2 * np.min([plower, pupper], 0)], 0)
    return pvalue
def test_empty(self):
    """stats.rankdata([]) should return an empty array."""
    a = np.array([], dtype=int)
    r = rankdata(a)
    assert_array_equal(r, np.array([], dtype=np.float64))
    r = rankdata([])
    assert_array_equal(r, np.array([], dtype=np.float64))
def rdm_similarity(ref_rdms, rdm, similarity_type='spearman',
                   computation_method=None, rdm_as_list=False):
    """
    Parameters
    ----------
    computation_method : str or None
        can be 'spearmanr' or 'rankdata+cdist'; these two now only apply to
        spearman. if you specify this, then you must have matching
        `similarity_type`.
    similarity_type : str
        can only be 'spearman' now.
    ref_rdms : ndarray
    rdm : ndarray

    Returns
    -------
    """
    if similarity_type == 'spearman' and computation_method is None:
        computation_method = 'spearmanr'

    # deal with case like returning a tuple of stuff.
    ref_rdms = np.atleast_2d(np.asarray(ref_rdms))
    rdm = np.asarray(rdm)
    assert type(rdm) == np.ndarray and type(ref_rdms) == np.ndarray
    assert ref_rdms.ndim == 2
    if not rdm_as_list:
        rdm = np.atleast_2d(rdm.ravel())  # this is 1 x N
    assert rdm.ndim == 2
    assert rdm.shape[1] == ref_rdms.shape[1]

    if computation_method == 'spearmanr':
        assert similarity_type == 'spearman'
        assert not rdm_as_list, 'only supporting one by one!'
        # if not, actually spearmanr will return a scalar instead.
        if ref_rdms.shape[0] >= 2:
            rdm_similarities = spearmanr(ref_rdms, rdm, axis=1).correlation[-1, :-1]
        else:
            # print('singular path!')
            rdm_similarities = np.atleast_1d(
                spearmanr(ref_rdms, rdm, axis=1).correlation)
    elif computation_method == 'rankdata+cdist':
        assert similarity_type == 'spearman'
        # do rank transform first, and then compute pearson.
        ref_rdms_ranked = np.array([rankdata(ref_rdm_this)
                                    for ref_rdm_this in ref_rdms])
        if not rdm_as_list:
            rdm_ranked = np.atleast_2d(rankdata(rdm.ravel()))
        else:
            rdm_ranked = np.array([rankdata(rdm_this) for rdm_this in rdm])
        rdm_similarities = 1 - cdist(rdm_ranked, ref_rdms_ranked, 'correlation')
    else:
        raise ValueError('unsupported computation method {}'.format(computation_method))

    # rdm_similarities will be 1d if not rdm_as_list, or 2d if rdm_as_list.
    assert rdm_similarities.ndim == 1 or rdm_similarities.ndim == 2
    if not rdm_as_list:
        rdm_similarities = rdm_similarities.ravel()
    else:
        assert rdm_similarities.ndim == 2
    return rdm_similarities
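# A quick consistency check for rdm_similarity above, with two tiny
# hypothetical reference RDMs; both computation paths should agree.
import numpy as np
from scipy.stats import spearmanr, rankdata
from scipy.spatial.distance import cdist

ref = np.array([[1., 2., 3., 4.],
                [4., 3., 2., 1.]])
cand = np.array([1.5, 2.5, 3.5, 4.5])
print(rdm_similarity(ref, cand, computation_method='spearmanr'))
print(rdm_similarity(ref, cand, computation_method='rankdata+cdist'))
# both should print ~[1., -1.]: perfectly concordant vs. discordant ranks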
def spear(self, xs):
    self.y_temp = rankdata(self.distRelated) + (~self.maskDistJaccard) * LARGE_NUMBER
    yRanks = np.amin([self.y_temp,
                      rankdata((self.maskDistJaccard) * LARGE_NUMBER + xs)
                      + self.nb_y_below_t], axis=0)
    #print zip(yRanks[0::20], (self.y_temp)[0::20], (rankdata((self.maskDistJaccard) * LARGE_NUMBER + xs) + self.nb_y_below_t)[0::20], rankdata(xs)[0::20])
    #print "numberptsbelowt: ", np.sum(self.maskDistJaccard)
    retval = 1 - np.sum(np.power(rankdata(xs) - yRanks, 2))
    return retval
def induce_correlations(data, corrmat):
    """
    Induce a set of correlations on a column-wise dataset

    Parameters
    ----------
    data : 2d-array
        An m-by-n array where m is the number of samples and n is the
        number of independent variables, each column of the array
        corresponding to each variable
    corrmat : 2d-array
        An n-by-n array that defines the desired correlation coefficients
        (between -1 and 1). Note: the matrix must be symmetric and
        positive-definite in order to induce.

    Returns
    -------
    new_data : 2d-array
        An m-by-n array that has the desired correlations.
    """
    # Create a rank matrix
    data_rank = np.vstack([rankdata(datai) for datai in data.T]).T

    # Generate van der Waerden scores
    data_rank_score = data_rank / (data_rank.shape[0] + 1.0)
    data_rank_score = norm(0, 1).ppf(data_rank_score)

    # Calculate the lower triangular matrix of the Cholesky decomposition
    # of the desired correlation matrix
    p = chol(corrmat)

    # Calculate the current correlations
    t = np.corrcoef(data_rank_score, rowvar=0)

    # Calculate the lower triangular matrix of the Cholesky decomposition
    # of the current correlation matrix
    q = chol(t)

    # Calculate the re-correlation matrix
    s = np.dot(p, np.linalg.inv(q))

    # Calculate the re-sampled matrix
    new_data = np.dot(data_rank_score, s.T)

    # Create the new rank matrix
    new_data_rank = np.vstack([rankdata(datai) for datai in new_data.T]).T

    # Sort the original data according to new_data_rank
    for i in range(data.shape[1]):
        vals, order = np.unique(
            np.hstack((data_rank[:, i], new_data_rank[:, i])),
            return_inverse=True)
        old_order = order[:new_data_rank.shape[0]]
        new_order = order[-new_data_rank.shape[0]:]
        tmp = data[np.argsort(old_order), i][new_order]
        data[:, i] = tmp[:]

    return data
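# Sanity check for induce_correlations above, on synthetic data. The module
# is assumed to provide `chol`, `rankdata` and `norm` as aliased below
# (this is an Iman-Conover-style rank correlation induction).
import numpy as np
from numpy.linalg import cholesky as chol
from scipy.stats import rankdata, norm

rng = np.random.default_rng(0)
data = rng.uniform(size=(1000, 2))
target = np.array([[1.0, 0.8],
                   [0.8, 1.0]])
out = induce_correlations(data, target)
print(np.corrcoef(out, rowvar=False))  # off-diagonals should be near 0.8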
def test_predict_rank(self):
    pred_scores = self.clf.decision_function(self.X_test)
    pred_ranks = self.clf._predict_rank(self.X_test)

    # assert the order is preserved
    assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
    assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
    assert_array_less(-0.1, pred_ranks)
def test_one(self):
    """Check stats.rankdata with an array of length 1."""
    data = [100]
    a = np.array(data, dtype=int)
    r = rankdata(a)
    assert_array_equal(r, np.array([1.0], dtype=np.float64))
    r = rankdata(data)
    assert_array_equal(r, np.array([1.0], dtype=np.float64))
def getKeywords(cityA, cityB, nid_A, nid_B):
    features = cityA.features
    Aranks = rankdata(cityA.cat_pmi[nid_A])
    Branks = rankdata(cityB.cat_pmi[nid_B])
    good_features = (Aranks + Branks).argsort()[::-1]
    weights = np.sort(len(features) * 2.0 - (Aranks + Branks))[::-1][:20]
    good_words = [features[i] for i in good_features[:20]]
    return good_words, weights
def test_predict_rank_normalized(self):
    pred_scores = self.clf.decision_function(self.X_test)
    pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)

    # assert the order is preserved
    assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
    assert_array_less(pred_ranks, 1.01)
    assert_array_less(-0.1, pred_ranks)
def mutualInformation(A, B):
    X = rankdata(A) / len(A)
    Y = rankdata(B) / len(B)
    forTree = np.transpose(np.array([X, Y]))
    res = evalNearest(forTree)
    res = res / Gamma / np.power(len(A), alpha)
    res = np.log(res)
    res = res / (1 - alpha)
    return res
def pcorr(x, y, z, method):
    # Compute partial correlation coefficient between x and y given z
    # x and y are n-element arrays
    # z is an m x n element array
    # method is 'p' for Pearson or 's' for Spearman rank
    # This is a port of the "var-covar" method pcor.mat written in R at
    # http://www.yilab.gatech.edu/pcor.R
    # See http://www.yilab.gatech.edu/pcor.html for more info
    # Note: I've followed some of their weird naming conventions
    # e.g., Sxx is the covariance between x and y
    # Also note that cov returns different things in R and in numpy
    nData = len(x)
    nControl = (np.shape(z))[0]  # number of control variables

    # The only difference between Pearson and Spearman is that
    # we rank the variables first for Spearman. In the large-N
    # limit they should also have the same null distribution (for pvalue)
    if method == 'p':  # pearson
        xc = x
        yc = y
        zc = z
    elif method == 's':  # spearman rank
        xc = stats.rankdata(x)
        yc = stats.rankdata(y)
        zc = np.zeros_like(z)
        for ii in range(nControl):
            zc[ii] = stats.rankdata(z[ii])
    else:
        raise ValueError("Error in pcorr: must specify method")

    Sxx = np.cov(xc, yc)
    Sxz = np.zeros(2 * nControl).reshape(2, nControl)
    Szz = np.zeros(nControl**2).reshape(nControl, nControl)
    for ii in range(nControl):
        Sxz[0, ii] = (np.cov(xc, zc[ii]))[0, 1]
        Sxz[1, ii] = (np.cov(yc, zc[ii]))[0, 1]
        for jj in range(nControl):
            Szz[ii, jj] = (np.cov(zc[ii], zc[jj]))[0, 1]

    # Check that Szz is positive definite before inverting
    if np.min(np.linalg.eigvals(Szz)) < 0:
        raise ValueError("Error in pcorr: Szz is not positive definite")
    SzzInv = np.linalg.inv(Szz)
    Sxxz = Sxx - np.dot(np.dot(Sxz, SzzInv), Sxz.T)
    coeff = Sxxz[0, 1] / (np.sqrt(Sxxz[0, 0]) * np.sqrt(Sxxz[1, 1]))
    pvalue, significance = pcorr_pvalue(coeff, nData, nControl)
    return (coeff, pvalue, significance)
def process_file(args):
    df = pd.read_table(args.infile, index_col=None)
    if args.drop_paralogs:
        df = drop_all_paralogs(df)
    if args.min_length:
        df = df[df['ORIGINAL_SEQUENCE_LENGTH'] >= args.min_length]
    if args.restrict_to_zero:
        df = df[df['OFFSET_FROM_START'] == 0]
    if args.restrict_around:
        df = df[(df['OFFSET_FROM_START'] >= args.restrict_around[0]) &
                (df['OFFSET_FROM_START'] <= args.restrict_around[1])]
    if args.remove_sequence:
        df = df[df['ORIGINAL_SEQUENCE'] != args.remove_sequence]
    if args.endswith:
        criterion = df['ORIGINAL_SEQUENCE'].map(
            lambda x: x.endswith(args.endswith))
        df = df[criterion]
    if args.threeprime_startswith:
        df = df[df['THREEPRIME_OF_CLEAVAGE'].map(
            lambda x: x.startswith(args.threeprime_startswith))]
    if args.three_prime_trimmed is not None:
        if args.three_prime_trimmed == '':
            df = df[df['3PTRIMMED'].isnull()]
        else:
            df = df[df['3PTRIMMED'] == args.three_prime_trimmed]
    if args.normalize_by_num_maps:
        df = calculate_num_times_map(df)
        df['WEIGHT'] = df['COUNT'] / df['NUM_TIMES_MAP']
        df['WEIGHT'] = df['WEIGHT'].fillna(0)
    if args.use_rank:
        if 'WEIGHT' in df.columns:
            df['WEIGHT'] = rankdata(df['WEIGHT'])
        else:
            df['WEIGHT'] = rankdata(df['COUNT'])
    if 'WEIGHT' not in df.columns:
        df['WEIGHT'] = df['COUNT']

    # combine upstream and downstream into one sequence
    if args.combine_sequences:
        left = df['ORIGINAL_SEQUENCE'].map(
            lambda x: x[args.combine_sequences[0]:])
        right = df['THREEPRIME_OF_CLEAVAGE'].map(
            lambda x: x[:args.combine_sequences[1]])
        df['FINAL_SEQUENCE'] = left + right

    # get the selected columns
    if 'all' not in args.columns:
        df = df[args.columns]

    # get rid of empty and null sequences
    if args.threeprime_of_cleavage:
        nucs = df[['THREEPRIME_OF_CLEAVAGE', 'COUNT']]
        nucs = nucs[nucs['THREEPRIME_OF_CLEAVAGE'] != '']
        nucs = nucs[~nucs['THREEPRIME_OF_CLEAVAGE'].isnull()]  # ~, not `not`, for Series
        export = explode_series(nucs['THREEPRIME_OF_CLEAVAGE'])
        export = nucleotide_frequencies(export, count_series=nucs['COUNT'],
                                        normalize=True,
                                        ignore_ns=args.ignore_ns)
        export.to_csv(args.outfile, sep='\t')
    else:
        df.to_csv(args.outfile, sep='\t', index=False)
def train(self, features, labels):
    from milk.unsupervised import zscore
    from scipy import stats
    if self.axis == 0:
        rlabels = np.array([stats.rankdata(ells) for ells in labels])
    else:
        rlabels = np.array([stats.rankdata(ells) for ells in labels.T])
        rlabels = rlabels.T
    rlabels[np.isnan(labels)] = np.nan
    return self.base.train(features, rlabels)
def scale_neg_1_to_1_with_zero_mean_rank_abs_max(v):
    rankv = v * 2 - 1
    pos_v = rankv[rankv > 0]
    pos_v = rankdata(pos_v, 'dense')
    pos_v = pos_v / pos_v.max()
    neg_v = rankv[rankv < 0]
    neg_v = rankdata(neg_v, 'dense')
    neg_v = neg_v / neg_v.max()
    rankv[rankv > 0] = pos_v
    rankv[rankv < 0] = -(neg_v.max() - neg_v)
    return scale_neg_1_to_1_with_zero_mean_abs_max(rankv)
def test_large_int(self):
    data = np.array([2**60, 2**60+1], dtype=np.uint64)
    r = rankdata(data)
    assert_array_equal(r, [1.0, 2.0])

    data = np.array([2**60, 2**60+1], dtype=np.int64)
    r = rankdata(data)
    assert_array_equal(r, [1.0, 2.0])

    data = np.array([2**60, -2**60+1], dtype=np.int64)
    r = rankdata(data)
    assert_array_equal(r, [2.0, 1.0])
def estimate_normalization_constant(readData, parameter):
    '''Estimate the normalization constant for all samples'''
    # if the user tells PePr not to estimate normalization constants
    if parameter.normalization != "YES":
        if parameter.normalization == "NO":
            norm_constants = [1.0] * len(readData.filename_list)
        else:
            norm = parameter.normalization.strip().split(',')
            if len(norm) != len(readData.filename_list):
                raise Exception('''The number of normalization constants does
                        not match the number of samples. Quitting..''')
            else:
                norm_constants = [float(x) for x in norm]
        for idx, file in enumerate(readData.filename_list):
            readData.normalization_constant[file] = norm_constants[idx]
        return
    # Split the genome into 1kb windows.
    bin = 1000
    array_dict = {}
    for file in readData.filename_list:
        array = numpy.array([], dtype=numpy.float64)
        for chr in readData.chr_list:
            row_num = int(readData.chr_length_dict[chr] / bin)
            array_by_chr = numpy.zeros(row_num, dtype=numpy.float64)
            for x in readData.data_dict[chr][file]:
                try:
                    array_by_chr[int(x / bin)] += 1
                except IndexError:
                    pass
            array = numpy.append(array, array_by_chr)
        array_dict[file] = array
    # Create a mixed chip sample and use it as the reference
    mixed_chip_array = numpy.array([], dtype=numpy.float64)
    for idx, chip in enumerate(readData.chip_filename_list):
        if idx == 0:
            mixed_chip_array = array_dict[chip].copy()
            rep_rank_sum = rankdata(-array_dict[chip])
        else:
            mixed_chip_array += array_dict[chip]
            rep_rank_sum += rankdata(-array_dict[chip])
    mixed_chip_array /= len(readData.chip_filename_list)
    # Estimate the input normalization constant using NCIS
    for input in readData.input_filename_list:
        norm_constant = estimate_input_normalization(
            mixed_chip_array, array_dict[input])
        readData.normalization_constant[input] = norm_constant
        debug("The scaling factor for %s is %s", input, norm_constant)
    # Estimate the chip normalization constant using the modified TMM method
    for chip in readData.chip_filename_list:
        norm_constant = estimate_chip_normalization(
            mixed_chip_array, array_dict[chip], rep_rank_sum)
        readData.normalization_constant[chip] = norm_constant
        debug("The scaling factor for %s is %s", chip, norm_constant)
    return
def plotScores(ax, scores, guideFreqs, scoreType, annotate, diam, doLegend=False):
    " create scatter plot "
    regrX = []
    regrY = []
    plotX = []
    plotY = []
    for extSeq, (guideName, modFreq) in guideFreqs.items():
        y = modFreq
        x = scores[extSeq][scoreType]
        regrX.append(x)
        regrY.append(y)
        # just for plot: adding jitter for a scoretype with many identical scores
        if scoreType.startswith('final'):
            x -= random.random() * 0.25
        plotX.append(x)
        plotY.append(y)

    # do not plot more than 3000 dots, makes PDF very slow to display
    #if len(plotX) > 3000:
        #print("Sampling scatter plot points down to 3000 points")
        #allDots = [(x, y) for x, y in zip(plotX, plotY)]
        #allDots = random.sample(allDots, 3000)
        #plotX, plotY = zip(*allDots)

    ax.scatter(plotX, plotY, alpha=.5, marker="o", s=diam, linewidth=0)

    if scoreType in ["wang", "wangOrig"]:
        ax.set_xlim(0, 1.0)
    elif scoreType in ["doench"]:
        ax.set_xlim(0, 100)
    elif scoreType == "chariRank":
        ax.set_xlim(0, 100.0)

    slope, intercept, r_value, p_value, std_err = linregress(regrX, regrY)
    print("score type %s: Pearson R %f, P %f" % (scoreType, r_value, p_value))
    line = slope * np.asarray(regrX) + intercept
    ax.plot(regrX, line, linestyle='-', color="orange")

    pearR, pearP = pearsonr(regrX, regrY)
    spearR, spearP = spearmanr(rankdata(regrX), rankdata(regrY))
    #mwU, mwP = mannwhitneyu(regrX, regrY)
    #ret = pearR
    ret = spearR
    #ax.annotate(r'Pearson R = %0.3f (p %0.3f)' % (pearR, pearP) + '\n' + r'Spearman $\rho$ = %0.3f (p %0.3f)' % (spearR, spearP) + "\nMann-Whitney U=%d (p=%0.3f)" % (int(mwU), mwP), xy=(0.40,0.08), fontsize=9, xycoords='axes fraction')
    ax.annotate(r'Pearson R = %0.3f (p %0.3f)' % (pearR, pearP) + '\n' +
                r'Spearman $\rho$ = %0.3f (p %0.3f)' % (spearR, spearP),
                xy=(0.40, 0.06), fontsize=9, xycoords='axes fraction')
    return ret
def rank_all(self):
    lec_scores = np.vstack((self.refs, self.best_lec_des, self.lec_scores_best))
    lec_rank = np.hstack([rankdata(a, 'min')[:, None] for a in -lec_scores.T])
    lmnn_scores = np.vstack((self.refs, self.best_lmnn_des, self.lmnn_scores_best))
    lmnn_rank = np.hstack([rankdata(a, 'min')[:, None] for a in -lmnn_scores.T])
    combine_scores = np.vstack((self.refs, self.best_combine_des,
                                self.combine_scores_best))
    combine_rank = np.hstack([rankdata(a, 'min')[:, None] for a in -combine_scores.T])
    all_scores = np.vstack((self.refs, self.des_scores_best, self.lec_scores_best,
                            self.lmnn_scores_best, self.combine_scores_best))
    all_rank = np.hstack([rankdata(a, 'min')[:, None] for a in -all_scores.T])
    return lec_rank, lmnn_rank, combine_rank, all_rank
def rank_data(obs: np.ndarray, axis=0, ties='average'):
    """
    Assign ranks to data, dealing with ties appropriately. This function
    works on matrices as well as vectors.

    Parameters
    ----------
    obs : ndarray
        Data to be ranked. Can only be 1 or 2 dimensional.
    axis : {0, 1}, optional
        The axis to perform the ranking. 0 means row, 1 means column.
    ties : str, default 'average'
        The method used to assign ranks to tied elements. The options are
        'average', 'min', 'max', 'dense' and 'ordinal'.
        'average': The average of the ranks that would have been assigned to
            all the tied values is assigned to each value.
        'min': The minimum of the ranks that would have been assigned to all
            the tied values is assigned to each value. (This is also referred
            to as "competition" ranking.)
        'max': The maximum of the ranks that would have been assigned to all
            the tied values is assigned to each value.
        'dense': Like 'min', but the rank of the next highest element is
            assigned the rank immediately after those assigned to the tied
            elements.
        'ordinal': All values are given a distinct rank, corresponding to the
            order that the values occur in `a`.

    Returns
    -------
    ndarray
        matrix or vector of the same dimension as `obs` containing the ranks
    """
    obs = np.asarray(obs)
    if obs.ndim == 1:
        return stats.rankdata(obs, ties)
    elif obs.ndim == 2:
        if axis == 0:
            return np.array([stats.rankdata(obs[i, :], ties)
                             for i in range(obs.shape[0])])
        return np.array([stats.rankdata(obs[:, i], ties)
                         for i in range(obs.shape[1])]).T
    else:
        raise ValueError('Can only rank data which is 1 or 2 dimensions')
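# Example for rank_data above: rank a toy matrix along rows, then columns.
import numpy as np
from scipy import stats

m = np.array([[3, 1, 2],
              [9, 9, 1]])
print(rank_data(m, axis=0))  # per-row ranks; ties get averaged
print(rank_data(m, axis=1))  # per-column ranks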
def kendall_w(list_all):
    """Kendall's coefficient of concordance (W) with tie correction."""
    from collections import Counter
    rank_all = []
    corrections = 0
    for ratings in list_all:
        ranks = rankdata(ratings)
        rank_all.append(ranks)
        tie_count = Counter(ranks)
        corrections += sum(tie_count[t]**3 - tie_count[t] for t in tie_count)
    rank_ = np.sum(rank_all, axis=0)
    rank_bar = np.mean(rank_)
    S = np.sum([(i - rank_bar)**2 for i in rank_])
    S_prime = np.sum([i**2 for i in rank_])
    m = len(list_all)
    n = len(rank_)
    W = (12 * S_prime - 3 * m**2 * n * (n + 1)**2) / \
        (m**2 * (n**3 - n) - m * corrections)
    return W
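# Sketch for kendall_w above: three hypothetical raters ranking four items
# identically should yield perfect concordance, W == 1.
import numpy as np
from scipy.stats import rankdata

ratings = [
    [1, 2, 3, 4],
    [2, 4, 6, 8],
    [10, 20, 30, 40],
]
print(kendall_w(ratings))  # expect 1.0 (no ties, perfect agreement)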
def __init__(self, target_dsm, control_dsms=None, resid=False,
             pairwise_metric='correlation', comparison_metric='pearson',
             center_data=False, corrcoef_only=False, **kwargs):
    """Initialize

    Parameters
    ----------
    dataset : Dataset
        Dataset with N samples such that the corresponding dissimilarity
        matrix has N*(N-1)/2 unique pairwise distances
    target_dsm : numpy array, length N*(N-1)/2
        Target dissimilarity matrix; this is the predictor whose results
        get mapped back
    control_dsms : list of numpy arrays, length N*(N-1)/2
        DMs to be controlled for when getting results of target_dsm back.
        Default: None
    resid : bool
        Set to True to return residuals to searchlight center for smoothing
        estimation. Default: False
    pairwise_metric : str
        To be used by pdist to calculate dataset DSM.
        Default: 'correlation'; see scipy.spatial.distance.pdist for other
        metric options.
    comparison_metric : str
        To be used for comparing dataset dsm with target dsm.
        Default: 'pearson'. Options: 'pearson' or 'spearman'
    center_data : bool
        Center data by subtracting mean column values from columns prior
        to calculating dataset dsm. Default: False
    corrcoef_only : bool
        If True, return only the correlation coefficient (rho), otherwise
        return rho and probability, p. Default: False

    Returns
    -------
    Dataset
        Contains the correlation coefficient (rho) only, or rho plus p,
        when corrcoef_only is set to False.
    """
    # init base classes first
    Measure.__init__(self, **kwargs)
    if comparison_metric not in ['spearman', 'pearson']:
        raise Exception("comparison_metric %s is not in "
                        "['spearman','pearson']" % comparison_metric)
    self.target_dsm = target_dsm
    if comparison_metric == 'spearman':
        self.target_dsm = rankdata(target_dsm)
    self.pairwise_metric = pairwise_metric
    self.comparison_metric = comparison_metric
    self.center_data = center_data
    self.corrcoef_only = corrcoef_only
    self.control_dsms = control_dsms
    if comparison_metric == 'spearman' and control_dsms is not None:
        self.control_dsms = [rankdata(dm) for dm in control_dsms]
    self.resid = resid
def test_on_set(fid, dataset, data_generator, label_generator, num_batches):
    '''Helper function that works for both training and validation sets'''
    print('Testing on {} data'.format(dataset))
    # Need to process data using generator
    our_preds = []
    true_preds = []
    corr = 0
    for batch_num in range(num_batches):
        (x, y) = data_generator.next()
        labels = label_generator.next()
        #raw_input('pause...')
        y = y[0]  # only one output, which is True/False or yield
        # TODO: pre-fetch data in queue
        preds = model.predict_on_batch(x)
        for i in range(preds.shape[0]):
            edits = labels['candidate_edits'][i]
            pred = preds[i, :]
            trueprob = pred[y[i, :] != 0][0]  # prob assigned to true outcome
            rank_true_edit = 1 + len(pred) - (ss.rankdata(pred))[np.argmax(y[i, :])]
            true_preds.append(trueprob)
            our_preds.append(pred[np.argmax(y[i, :])])
            if np.argmax(pred) == np.argmax(y[i, :]):
                corr += 1
            # Get most informative labels for the highest predictions
            if rank_true_edit != 1:
                # record highest probability
                most_likely_edit_i = np.argmax(pred)
                most_likely_prob = np.max(pred)
            else:
                # record number two prediction
                most_likely_edit_i = np.argmax(pred[pred != np.max(pred)])
                most_likely_prob = np.max(pred[pred != np.max(pred)])
            trueyield = 0.0
            try:
                most_likely_smiles = labels['candidate_smiles'][i][most_likely_edit_i]
                most_likely_edit = edits[most_likely_edit_i]
            except IndexError:
                most_likely_smiles = 'no_reaction'
                most_likely_edit = 'no_reaction'
            fid.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                labels['reaction_true'][i], dataset,
                edits[np.argmax(y[i, :])], trueprob,
                most_likely_edit, most_likely_prob, rank_true_edit,
                labels['reaction_true'][i].split('>')[-1],
                most_likely_smiles, labels['rxdid'][i], trueyield))
    return our_preds, corr
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 10 11:47:53 2017

@author: kcarnold
"""
import numpy as np
from suggestion import suggestion_generator
from scipy.special import logsumexp
from scipy.stats import rankdata
#%%
model = suggestion_generator.get_model('yelp_train-balanced')
#%%
wf_bins_rank = rankdata(model.unigram_probs, method='average')
#wf_bins_rank = np.argsort(model.unigram_probs_wordsonly)
wf_bins = (10 * wf_bins_rank / (wf_bins_rank.max() + 1)).astype(int)
bin_counts = np.bincount(wf_bins)
#%%
for word in 'huevos tri place'.split():
    idx = model.model.vocab_index(word)
    print(f"{model.unigram_probs_wordsonly[idx]:.2f}, bin={wf_bins[idx]}")
#%%
mean_probs = model.unigram_probs_wordsonly @ np.eye(10)[wf_bins]
#%%
# bin 6 seems high, and bin 1. Why?
[model.id2str[idx] for idx in np.flatnonzero(wf_bins == 6)[:20]]
[
    wf_bins_rank[idx] / wf_bins_rank.max()
def rank_configs_by_width(models, C):
    mpiws = [pi.mw(model, C) for model in models]
    return rankdata(mpiws, 'max')
def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided",
                 axis=0, method="auto"):
    r'''Perform the Mann-Whitney U rank test on two independent samples.

    The Mann-Whitney U test is a nonparametric test of the null hypothesis
    that the distribution underlying sample `x` is the same as the
    distribution underlying sample `y`. It is often used as a test of
    difference in location between distributions.

    Parameters
    ----------
    x, y : array-like
        N-d arrays of samples. The arrays must be broadcastable except along
        the dimension given by `axis`.
    use_continuity : bool, optional
        Whether a continuity correction (1/2) should be applied.
        Default is True when `method` is ``'asymptotic'``; has no effect
        otherwise.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        Let *F(u)* and *G(u)* be the cumulative distribution functions of the
        distributions underlying `x` and `y`, respectively. Then the
        following alternative hypotheses are available:

        * 'two-sided': the distributions are not equal, i.e. *F(u) ≠ G(u)*
          for at least one *u*.
        * 'less': the distribution underlying `x` is stochastically less than
          the distribution underlying `y`, i.e. *F(u) > G(u)* for all *u*.
        * 'greater': the distribution underlying `x` is stochastically
          greater than the distribution underlying `y`, i.e. *F(u) < G(u)*
          for all *u*.

        Under a more restrictive set of assumptions, the alternative
        hypotheses can be expressed in terms of the locations of the
        distributions; see [5] section 5.1.
    axis : int, optional
        Axis along which to perform the test. Default is 0.
    method : {'auto', 'asymptotic', 'exact'}, optional
        Selects the method used to calculate the *p*-value.
        Default is 'auto'. The following options are available.

        * ``'asymptotic'``: compares the standardized test statistic against
          the normal distribution, correcting for ties.
        * ``'exact'``: computes the exact *p*-value by comparing the observed
          :math:`U` statistic against the exact distribution of the :math:`U`
          statistic under the null hypothesis. No correction is made for
          ties.
        * ``'auto'``: chooses ``'exact'`` when the size of one of the samples
          is less than 8 and there are no ties; chooses ``'asymptotic'``
          otherwise.

    Returns
    -------
    res : MannwhitneyuResult
        An object containing attributes:

        statistic : float
            The Mann-Whitney U statistic corresponding with sample `x`. See
            Notes for the test statistic corresponding with sample `y`.
        pvalue : float
            The associated *p*-value for the chosen `alternative`.

    Notes
    -----
    If ``U1`` is the statistic corresponding with sample `x`, then the
    statistic corresponding with sample `y` is
    ``U2 = x.shape[axis] * y.shape[axis] - U1``.

    `mannwhitneyu` is for independent samples. For related / paired samples,
    consider `scipy.stats.wilcoxon`.

    `method` ``'exact'`` is recommended when there are no ties and when
    either sample size is less than 8 [1]_. The implementation follows the
    recurrence relation originally proposed in [1]_ as it is described in
    [3]_. Note that the exact method is *not* corrected for ties, but
    `mannwhitneyu` will not raise errors or warnings if there are ties in
    the data.

    The Mann-Whitney U test is a non-parametric version of the t-test for
    independent samples. When the means of samples from the populations are
    normally distributed, consider `scipy.stats.ttest_ind`.

    See Also
    --------
    scipy.stats.wilcoxon, scipy.stats.ranksums, scipy.stats.ttest_ind

    References
    ----------
    .. [1] H.B. Mann and D.R. Whitney, "On a test of whether one of two
           random variables is stochastically larger than the other",
           The Annals of Mathematical Statistics, Vol. 18, pp. 50-60, 1947.
    .. [2] Mann-Whitney U Test, Wikipedia,
           http://en.wikipedia.org/wiki/Mann-Whitney_U_test
    .. [3] A. Di Bucchianico, "Combinatorics, computer algebra, and the
           Wilcoxon-Mann-Whitney test", Journal of Statistical Planning and
           Inference, Vol. 79, pp. 349-364, 1999.
    .. [4] Rosie Shier, "Statistics: 2.3 The Mann-Whitney U Test",
           Mathematics Learning Support Centre, 2004.
    .. [5] Michael P. Fay and Michael A. Proschan. "Wilcoxon-Mann-Whitney
           or t-test? On assumptions for hypothesis tests and multiple
           interpretations of decision rules." Statistics surveys, Vol. 4,
           pp. 1-39, 2010.
           https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2857732/

    Examples
    --------
    We follow the example from [4]_: nine randomly sampled young adults were
    diagnosed with type II diabetes at the ages below.

    >>> males = [19, 22, 16, 29, 24]
    >>> females = [20, 11, 17, 12]

    We use the Mann-Whitney U test to assess whether there is a
    statistically significant difference in the diagnosis age of males and
    females. The null hypothesis is that the distribution of male diagnosis
    ages is the same as the distribution of female diagnosis ages. We decide
    that a confidence level of 95% is required to reject the null hypothesis
    in favor of the alternative that the distributions are different. Since
    the number of samples is very small and there are no ties in the data,
    we can compare the observed test statistic against the *exact*
    distribution of the test statistic under the null hypothesis.

    >>> from scipy.stats import mannwhitneyu
    >>> U1, p = mannwhitneyu(males, females, method="exact")
    >>> print(U1)
    17.0

    `mannwhitneyu` always reports the statistic associated with the first
    sample, which, in this case, is males. This agrees with :math:`U_M = 17`
    reported in [4]_. The statistic associated with the second sample can be
    calculated:

    >>> nx, ny = len(males), len(females)
    >>> U2 = nx*ny - U1
    >>> print(U2)
    3.0

    This agrees with :math:`U_F = 3` reported in [4]_. The two-sided
    *p*-value can be calculated from either statistic, and the value
    produced by `mannwhitneyu` agrees with :math:`p = 0.11` reported in
    [4]_.

    >>> print(p)
    0.1111111111111111

    The exact distribution of the test statistic is asymptotically normal,
    so the example continues by comparing the exact *p*-value against the
    *p*-value produced using the normal approximation.

    >>> _, pnorm = mannwhitneyu(males, females, method="asymptotic")
    >>> print(pnorm)
    0.11134688653314041

    Here `mannwhitneyu`'s reported *p*-value appears to conflict with the
    value :math:`p = 0.09` given in [4]_. The reason is that [4]_ does not
    apply the continuity correction performed by `mannwhitneyu`;
    `mannwhitneyu` reduces the distance between the test statistic and the
    mean :math:`\mu = n_x n_y / 2` by 0.5 to correct for the fact that the
    discrete statistic is being compared against a continuous distribution.
    Here, the :math:`U` statistic used is less than the mean, so we reduce
    the distance by adding 0.5 in the numerator.

    >>> import numpy as np
    >>> from scipy.stats import norm
    >>> U = min(U1, U2)
    >>> N = nx + ny
    >>> z = (U - nx*ny/2 + 0.5) / np.sqrt(nx*ny * (N + 1)/ 12)
    >>> p = 2 * norm.cdf(z)  # use CDF to get p-value from smaller statistic
    >>> print(p)
    0.11134688653314041

    If desired, we can disable the continuity correction to get a result
    that agrees with that reported in [4]_.

    >>> _, pnorm = mannwhitneyu(males, females, use_continuity=False,
    ...                         method="asymptotic")
    >>> print(pnorm)
    0.0864107329737

    Regardless of whether we perform an exact or asymptotic test, the
    probability of the test statistic being as extreme or more extreme by
    chance exceeds 5%, so we do not consider the results statistically
    significant.

    Suppose that, before seeing the data, we had hypothesized that females
    would tend to be diagnosed at a younger age than males. In that case,
    it would be natural to provide the female ages as the first input, and
    we would have performed a one-sided test using ``alternative = 'less'``:
    females are diagnosed at an age that is stochastically less than that
    of males.

    >>> res = mannwhitneyu(females, males, alternative="less", method="exact")
    >>> print(res)
    MannwhitneyuResult(statistic=3.0, pvalue=0.05555555555555555)

    Again, the probability of getting a sufficiently low value of the test
    statistic by chance under the null hypothesis is greater than 5%, so we
    do not reject the null hypothesis in favor of our alternative.

    If it is reasonable to assume that the means of samples from the
    populations are normally distributed, we could have used a t-test to
    perform the analysis.

    >>> from scipy.stats import ttest_ind
    >>> res = ttest_ind(females, males, alternative="less")
    >>> print(res)
    Ttest_indResult(statistic=-2.239334696520584, pvalue=0.030068441095757924)

    Under this assumption, the *p*-value would be low enough to reject the
    null hypothesis in favor of the alternative.
    '''
    x, y, use_continuity, alternative, axis_int, method = (
        _mwu_input_validation(x, y, use_continuity, alternative, axis, method))

    x, y, xy = _broadcast_concatenate(x, y, axis)

    n1, n2 = x.shape[-1], y.shape[-1]

    if method == "auto":
        method = _mwu_choose_method(n1, n2, xy, method)

    # Follows [2]
    ranks = stats.rankdata(xy, axis=-1)  # method 2, step 1
    R1 = ranks[..., :n1].sum(axis=-1)    # method 2, step 2
    U1 = R1 - n1 * (n1 + 1) / 2          # method 2, step 3
    U2 = n1 * n2 - U1                    # as U1 + U2 = n1 * n2

    if alternative == "greater":
        U, f = U1, 1  # U is the statistic to use for p-value, f is a factor
    elif alternative == "less":
        U, f = U2, 1  # Due to symmetry, use SF of U2 rather than CDF of U1
    else:
        U, f = np.maximum(U1, U2), 2  # multiply SF by two for two-sided test

    if method == "exact":
        p = _mwu_state.sf(U.astype(int), n1, n2)
    elif method == "asymptotic":
        z = _get_mwu_z(U, n1, n2, ranks, continuity=use_continuity)
        p = stats.norm.sf(z)
    p *= f

    # Ensure that test statistic is not greater than 1
    # This could happen for exact test when U = m*n/2
    p = np.clip(p, 0, 1)

    return MannwhitneyuResult(U1, p)
def ranks(scores, ascending=False):
    sign = 1 if ascending else -1
    scores = scores * sign
    ranks = [stats.rankdata(scores[i])[0] for i in range(scores.shape[0])]
    return ranks
def get_background_rank_df(cls, frequency_path=None):
    df = cls.get_background_frequency_df(frequency_path)
    df['rank'] = rankdata(df.background, method='dense')
    df['background'] = df['rank'] / df['rank'].max()
    return df[['background']]
def fdr(p_vals):
    """Adjust p-values with Benjamini-Hochberg.

    Parameters
    ----------
    p_vals : array-like
        p-values to adjust.

    Returns
    -------
    Pandas.DataFrame

        DataFrame where values are order of data.

    Examples
    --------
    >>> np.random.seed(0)
    >>> x = np.random.random(10) / 100
    >>> gr = pr.random(10)
    >>> gr.PValue = x
    >>> gr
    +--------------+-----------+-----------+--------------+----------------------+
    | Chromosome   | Start     | End       | Strand       | PValue               |
    | (category)   | (int32)   | (int32)   | (category)   | (float64)            |
    |--------------+-----------+-----------+--------------+----------------------|
    | chr1         | 176601938 | 176602038 | +            | 0.005488135039273248 |
    | chr1         | 155082851 | 155082951 | -            | 0.007151893663724195 |
    | chr2         | 211134424 | 211134524 | -            | 0.006027633760716439 |
    | chr9         | 78826761  | 78826861  | -            | 0.005448831829968969 |
    | ...          | ...       | ...       | ...          | ...                  |
    | chr16        | 52216522  | 52216622  | +            | 0.004375872112626925 |
    | chr17        | 8085927   | 8086027   | -            | 0.008917730007820798 |
    | chr19        | 17333425  | 17333525  | +            | 0.009636627605010294 |
    | chr22        | 16728001  | 16728101  | +            | 0.003834415188257777 |
    +--------------+-----------+-----------+--------------+----------------------+
    Stranded PyRanges object has 10 rows and 5 columns from 9 chromosomes.
    For printing, the PyRanges was sorted on Chromosome and Strand.

    >>> gr.FDR = pr.stats.fdr(gr.PValue)
    >>> gr.print(formatting={"PValue": "{:.4f}", "FDR": "{:.4}"})
    +--------------+-----------+-----------+--------------+-------------+-------------+
    | Chromosome   | Start     | End       | Strand       | PValue      | FDR         |
    | (category)   | (int32)   | (int32)   | (category)   | (float64)   | (float64)   |
    |--------------+-----------+-----------+--------------+-------------+-------------|
    | chr1         | 176601938 | 176602038 | +            | 0.0055      | 0.01098     |
    | chr1         | 155082851 | 155082951 | -            | 0.0072      | 0.00894     |
    | chr2         | 211134424 | 211134524 | -            | 0.0060      | 0.01005     |
    | chr9         | 78826761  | 78826861  | -            | 0.0054      | 0.01362     |
    | ...          | ...       | ...       | ...          | ...         | ...         |
    | chr16        | 52216522  | 52216622  | +            | 0.0044      | 0.01459     |
    | chr17        | 8085927   | 8086027   | -            | 0.0089      | 0.009909    |
    | chr19        | 17333425  | 17333525  | +            | 0.0096      | 0.009637    |
    | chr22        | 16728001  | 16728101  | +            | 0.0038      | 0.03834     |
    +--------------+-----------+-----------+--------------+-------------+-------------+
    Stranded PyRanges object has 10 rows and 6 columns from 9 chromosomes.
    For printing, the PyRanges was sorted on Chromosome and Strand.
    """
    from scipy.stats import rankdata
    ranked_p_values = rankdata(p_vals)
    fdr = p_vals * len(p_vals) / ranked_p_values
    fdr[fdr > 1] = 1
    return fdr
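# fdr above also works on a plain numpy array (no PyRanges required):
# adjusted p = p * n / rank, capped at 1. Note this simple variant does not
# enforce the step-up monotonicity of full Benjamini-Hochberg.
import numpy as np

pvals = np.array([0.001, 0.008, 0.039, 0.041, 0.042,
                  0.060, 0.074, 0.205, 0.212, 0.216])
print(fdr(pvals))  # e.g. 0.001 * 10 / 1 == 0.01 for the smallest p-value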
def match_digit(digits_used, anchor, y_pred, anchor_pred):
    y_pred_matched = np.zeros(y_pred.shape).astype(digits_used.dtype)
    for y_pred_i, rank in enumerate(stats.rankdata(anchor_pred)):
        digit = digits_used[np.argsort(anchor)[int(rank) - 1]]
        y_pred_matched[y_pred == y_pred_i] = digit
    return y_pred_matched
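# Toy run of match_digit above (hypothetical cluster relabeling): predicted
# cluster ids are remapped to the digits whose anchor values rank the same.
import numpy as np
from scipy import stats

digits_used = np.array([0, 1, 2])
anchor = np.array([0.1, 0.9, 0.5])       # per-digit anchor statistic
anchor_pred = np.array([0.7, 0.2, 0.4])  # same statistic per predicted cluster
y_pred = np.array([0, 1, 2, 1])
print(match_digit(digits_used, anchor, y_pred, anchor_pred))  # -> [1 0 2 0]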
stackx_data["answer"] = stackx_data["answer"].astype(str) #print(stackx_data.shape) log.write("data loaded of size %s \n" % (str(stackx_data.shape))) # Normalize aux targets encoded = [] trange = tqdm(stackx_data["host"].unique()) for host in trange: host_mask = stackx_data["host"] == host trange.set_description(str(host)) host_labels = deepcopy(stackx_data[host_mask][TARGETS]) for col in [ "question_score", "question_views", "question_favs", "answer_score" ]: host_labels[col] = rankdata( stackx_data[host_mask][col]) / host_mask.sum() encoded.append(host_labels) encoded = pd.concat(encoded, sort=False).reindex(stackx_data.index) stackx_data[encoded.columns] = encoded log.write("Aux targets are normalized \n") #Train-Val Split train_df, test_df = train_test_split(stackx_data, test_size=0.1, random_state=args.seed) log.write(" Train-Val Split : train_df size %s \t val_df size is %s \n" % (str(train_df.shape), str(test_df.shape))) #tokenizer
def ranksum_thread(vec):
    """
    Wilcoxon rank sum test for one feature

    Adapted from the following R functions:
    `Seurat::FindMarkers` and `stats::wilcox.test`
    """
    # Preparation
    vec = vec.toarray().ravel() if issparse(vec) else vec
    pct = np.empty((group_onehot.shape[1], 2))
    logfc = np.empty((group_onehot.shape[1]))
    for i in range(group_onehot.shape[1]):
        mask = group_onehot[:, i].ravel()
        pct[i, 0] = round(np.sum(vec[mask] > 0) / n_x[i], 3)
        pct[i, 1] = round(np.sum(vec[~mask] > 0) / n_y[i], 3)
        logfc[i] = np.log(vec[mask].mean() + pseudocount) - \
            np.log(vec[~mask].mean() + pseudocount)

    # Percent expressed filtering
    pct_max = pct.max(axis=1)
    pct_min = pct.min(axis=1)
    pct_diff = pct_max - pct_min
    pct_mask = (pct_max > min_pct) & (pct_diff > min_pct_diff)

    # Fold change filtering
    if alternative == "greater":
        logfc_mask = logfc > logfc_threshold
    elif alternative == "less":
        logfc_mask = logfc < -logfc_threshold
    elif alternative == "two-sided":
        logfc_mask = abs(logfc) > logfc_threshold
    total_mask = pct_mask & logfc_mask
    if not np.any(total_mask):
        nan_placeholder = np.empty(group_onehot.shape[1])
        nan_placeholder[:] = np.nan
        return pct[:, 0].ravel(), pct[:, 1].ravel(), logfc, \
            nan_placeholder, nan_placeholder, nan_placeholder

    # Rank sum test
    rank = rankdata(vec)
    n_ties = np.unique(rank, return_counts=True)[1]
    stat = np.empty(group_onehot.shape[1])
    for i in range(group_onehot.shape[1]):
        mask = group_onehot[:, i].ravel()
        if total_mask[i]:
            stat[i] = rank[mask].sum() - n_x[i] * (n_x[i] + 1) / 2
        else:
            stat[i] = np.nan
    z = stat - n_x * n_y / 2
    sigma = np.sqrt(
        (n_xy_prod / 12) * ((n_xy_plus + 1) -
                            (n_ties**3 - n_ties).sum() /
                            (n_xy_plus * (n_xy_plus - 1))))
    if alternative == "two-sided":
        correction = np.sign(z) * 0.5
    elif alternative == "greater":
        correction = 0.5
    elif alternative == "less":
        correction = -0.5
    z = (z - correction) / sigma
    if alternative == "two-sided":
        pval = 2 * np.stack([norm.sf(z), norm.cdf(z)], axis=0).min(axis=0)
    elif alternative == "greater":
        pval = norm.sf(z)
    elif alternative == "less":
        pval = norm.cdf(z)
    return pct[:, 0].ravel(), pct[:, 1].ravel(), logfc, stat, z, pval
def pbo(M, S, metric_func, threshold, n_jobs=1, verbose=False, plot=False,
        hist=False):
    '''
    Based on http://papers.ssrn.com/sol3/papers.cfm?abstract_id=2326253

    Features:
    * training and test sets are of equal size, providing comparable
      accuracy to both IS and OOS Sharpe ratios.
    * CSCV is symmetric, decline in performance can only result from
      overfitting, not arbitrary discrepancies between the training and
      test sets.
    * CSCV respects the time-dependence and other season-dependent features
      present in the data.
    * Results are deterministic, can be replicated.
    * Dispersion in the distribution of logits conveys relevant info
      regarding the robustness of the strategy selection process.
    * Model-free, non-parametric. Logits distribution resembles the
      cumulative Normal distribution if w_bar are close to uniform
      distribution (i.e. the backtest appears to be information-less).
      Therefore, for good backtesting, the distribution of logits will be
      centered in a significantly positive value, and its tail will
      marginally cover the region of negative logit values.

    Limitations:
    * CSCV is symmetric, for some strategies, K-fold CV might be better.
    * Not suitable for time series with strong auto-correlation, especially
      when S is large.
    * Assumes all the sample statistics carry the same weight.
    * Entirely possible that all the N strategy configs have high but
      similar Sharpe ratios. Therefore, PBO may appear high, however,
      'overfitting' here is among many 'skilful' strategies.

    Parameters:
        M: returns data, numpy or dataframe format.
        S: chunks to divide M into, must be an even number. Paper suggests
            setting S = 16. See paper for details of choice of S.
        metric_func: evaluation function for returns data
        threshold: used as prob. of OOS Loss calculation cutoff. For Sharpe
            ratio, this should be 0 to indicate probability of loss.
        n_jobs: if greater than 1 then enable parallel mode
        hist: Default False, whether to plot histogram for rank of logits.
            Some problems exist when S >= 10. Need to look at why numpy /
            matplotlib does it.

    Returns:
        PBO result in namedtuple, instance of PBO.
    '''
    if S % 2 == 1:
        raise ValueError('S must be an even integer, {:.1f} was given'
                         .format(S))

    n_jobs = int(n_jobs)
    if n_jobs < 0:
        n_jobs = max(1, ps.cpu_count(logical=False))

    if isinstance(M, pd.DataFrame):
        # convert to numpy values
        if verbose:
            print('Convert from DataFrame to numpy array.')
        M = M.values

    # Paper suggests T should be 2x the no. of observations used by investor
    # to choose a model config, due to the fact that CSCV compares
    # combinations of T/2 observations with their complements.
    T, N = M.shape
    residual = T % S
    if residual != 0:
        M = M[residual:]
        T, N = M.shape
    sub_T = T // S
    if verbose:
        print('Total sample size: {:,d}, chunk size: {:,d}'.format(T, sub_T))

    # generate subsets, each of length sub_T
    Ms = []
    Ms_values = []
    for i in range(S):
        start, end = i * sub_T, (i + 1) * sub_T
        Ms.append((i, M[start:end, :]))
        Ms_values.append(M[start:end, :])
    Ms_values = np.array(Ms_values)
    if verbose:
        print('No. of chunks: {:,d}'.format(len(Ms)))

    # generate combinations
    Cs = [x for x in itr.combinations(Ms, S // 2)]
    if verbose:
        print('No. of combinations = {:,d}'.format(len(Cs)))

    # Ms_index used to find J_bar (complementary OOS part)
    Ms_index = set([x for x in range(len(Ms))])

    # create J and J_bar
    if n_jobs < 2:
        J = []
        J_bar = []
        for i in range(len(Cs)):
            # make sure chunks are concatenated in their original order
            order = [x for x, _ in Cs[i]]
            sort_ind = np.argsort(order)
            Cs_values = np.array([v for _, v in Cs[i]])
            # if verbose:
            #     print('Cs index = {}, '.format(order), end='')
            joined = np.concatenate(Cs_values[sort_ind, :])
            J.append(joined)
            # find Cs_bar
            Cs_bar_index = list(sorted(Ms_index - set(order)))
            # if verbose:
            #     print('Cs_bar_index = {}'.format(Cs_bar_index))
            J_bar.append(np.concatenate(Ms_values[Cs_bar_index, :]))

        # compute metrics for J and J_bar, e.g. Sharpe ratio
        R = [metric_func(j) for j in J]
        R_bar = [metric_func(j) for j in J_bar]

        # compute ranks of metrics
        R_rank = [ss.rankdata(x) for x in R]
        R_bar_rank = [ss.rankdata(x) for x in R_bar]

        # find highest metric, rn contains the index position of max value
        # in each set of R (IS)
        rn = [np.argmax(r) for r in R_rank]
        # use above index to find R_bar (OOS) in same index position,
        # i.e. the same config / setting
        rn_bar = [R_bar_rank[i][rn[i]] for i in range(len(R_bar_rank))]
        # formula in paper used N+1 as the denominator for w_bar.
        w_bar = [float(r) / N for r in rn_bar]
        # logit(.5) gives 0, so if a w_bar value equals the median its logit is 0
        logits = [spec.logit(w) for w in w_bar]
    else:
        # use joblib for parallel calc
        # print('Run in parallel mode.')
        cores = job.Parallel(n_jobs=n_jobs)(
            job.delayed(pbo_core_calc)(Cs_x, Ms, Ms_values, Ms_index,
                                       metric_func, verbose)
            for Cs_x in Cs)
        # core_df = pd.DataFrame(cores, columns=PBOCore._fields)
        # convert to values needed.
        #
        # core_df = pd.DataFrame.from_records(cores)
        # J = core_df.J.values
        # J_bar = core_df.J_bar.values
        # R = core_df.R.values
        # R_bar = core_df.R_bar.values
        # R_rank = core_df.R_rank.values
        # R_bar_rank = core_df.R_bar_rank.values
        # rn = core_df.rn.values
        # rn_bar = core_df.rn_bar.values
        # w_bar = core_df.w_bar.values
        # logits = core_df.logits.values
        J = [c.J for c in cores]
        J_bar = [c.J_bar for c in cores]
        R = [c.R for c in cores]
        R_bar = [c.R_bar for c in cores]
        R_rank = [c.R_rank for c in cores]
        R_bar_rank = [c.R_bar_rank for c in cores]
        rn = [c.rn for c in cores]
        rn_bar = [c.rn_bar for c in cores]
        w_bar = [c.w_bar for c in cores]
        logits = [c.logits for c in cores]

    # prob of overfitting
    phi = np.array([1.0 if lam <= 0 else 0.0 for lam in logits]) / len(Cs)
    pbo_test = np.sum(phi)

    # performance degradation
    R_n_star = np.array([R[i][rn[i]] for i in range(len(R))])
    R_bar_n_star = np.array([R_bar[i][rn[i]] for i in range(len(R_bar))])
    lm = ss.linregress(x=R_n_star, y=R_bar_n_star)

    prob_oos_loss = np.sum(
        [1.0 if r < threshold else 0.0 for r in R_bar_n_star]) / len(R_bar_n_star)

    # Stochastic dominance
    y = np.linspace(min(R_bar_n_star), max(R_bar_n_star), endpoint=True,
                    num=1000)
    R_bar_n_star_cdf = smd.ECDF(R_bar_n_star)
    optimized = R_bar_n_star_cdf(y)
    R_bar_cdf = smd.ECDF(np.concatenate(R_bar))
    non_optimized = R_bar_cdf(y)

    dom_df = pd.DataFrame(dict(optimized_IS=optimized,
                               non_optimized_OOS=non_optimized))
    dom_df.index = y
    # visually, non_optimized curve above optimized curve indicates good
    # backtest with low overfitting.
    dom_df['SD2'] = dom_df.non_optimized_OOS - dom_df.optimized_IS

    result = PBO(pbo_test, prob_oos_loss, lm, dom_df, Cs, J, J_bar, R, R_bar,
                 R_rank, R_bar_rank, rn, rn_bar, w_bar, logits,
                 R_n_star, R_bar_n_star)

    if plot:
        plot_pbo(result, hist=hist)

    return result
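# Hypothetical smoke test for pbo above: pure-noise returns for 50 strategy
# configs, S=8 CSCV chunks, scored with an illustrative Sharpe-like metric.
# Assumes the module-level aliases used by pbo (np, pd, itr, ss, spec, smd,
# and the PBO namedtuple) are available.
import numpy as np

def sharpe_like(returns):
    # per-column mean/std ratio; a stand-in metric, not the paper's exact one
    return returns.mean(axis=0) / returns.std(axis=0)

rng = np.random.default_rng(0)
M = rng.normal(0.0005, 0.01, size=(800, 50))
result = pbo(M, S=8, metric_func=sharpe_like, threshold=0, n_jobs=1)
print(result[0])  # pbo_test: for information-less data, expect roughly 0.5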
def rankSelection(self):
    #fitnesses = self.getFitnessPop()
    rank_fitnesses = rankdata(self.fitnesses)
    probs = [f / sum(rank_fitnesses) for f in rank_fitnesses]
    p1, p2 = np.random.choice(self.pop, 2, p=probs)
    return p1, p2
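# Standalone illustration of the rank-selection idea used above: selection
# probability is proportional to fitness *rank*, so a single runaway fitness
# value no longer dominates the draw. Numbers are made up.
import numpy as np
from scipy.stats import rankdata

fitnesses = np.array([0.1, 0.2, 10.0])
probs = rankdata(fitnesses) / rankdata(fitnesses).sum()
print(probs)  # [0.1667 0.3333 0.5] vs. raw-fitness probs of ~[0.01 0.02 0.97]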
def prec_(request):
    if request.method == 'POST':
        checks = request.POST.getlist('checks_[]')
        target = request.POST['target_']
        typedata = request.POST.get('typedata_')
        checks = list(map(int, checks))
        target = int(target)
        print(typedata)

        test_df = request.session['test_df']
        x = test_df.iloc[:, checks].values
        y = test_df.iloc[:, target].values.astype('int64')
        train_df = request.session['train_df']
        x_tr = train_df.iloc[:, checks].values
        y_tr = train_df.iloc[:, target].values

        # Preprocessing: label-encode, then one-hot encode, the categorical columns.
        dtypes_list = list(test_df.dtypes)
        categorical_lst = []
        for i in range(len(checks)):
            if dtypes_list[checks[i]] == 'object':
                categorical_lst.append(i)
        labelencoder = LabelEncoder()
        for i in categorical_lst:
            x[:, i] = labelencoder.fit_transform(x[:, i])
            x_tr[:, i] = labelencoder.fit_transform(x_tr[:, i])
        if len(categorical_lst) != 0:
            # Legacy scikit-learn (<0.22) API; newer versions use ColumnTransformer.
            oneh = OneHotEncoder(categorical_features=categorical_lst)
            x = oneh.fit_transform(x).toarray()
            x_tr = oneh.fit_transform(x_tr).toarray()
            # avoid dummy variable trap
        sc = StandardScaler()
        x = sc.fit_transform(x)
        x_tr = sc.fit_transform(x_tr)

        regList = request.session['regList']
        regnames = request.session['regnames']
        print(regnames)
        resultDict = {
            'regs': len(regList),
            'regnames': regnames,
            'auc': [], 'tp': [], 'tn': [], 'fn': [], 'fp': [],
            'accuracy': [], 'recall': [], 'precision': [], 'f1': []
        }
        if typedata == 'train':
            x = x_tr
            y = y_tr
        for i in range(len(regList)):
            y_pred = regList[i].predict(x)
            cm = confusion_matrix(y, y_pred.round())
            # Predict probabilities; keep only the positive class.
            probs = regList[i].predict_proba(x)[:, 1]
            auc_value = roc_auc_score(y, probs)
            resultDict['auc'].append(auc_value)
            # Note: these assignments treat cm[0][0] as TP, i.e. the class in
            # row/column 0 is taken to be the positive class.
            resultDict['tp'].append(int(cm[0][0]))
            resultDict['tn'].append(int(cm[1][1]))
            resultDict['fn'].append(int(cm[0][1]))
            resultDict['fp'].append(int(cm[1][0]))
            accuracy = (cm[0][0] + cm[1][1]) / (cm[0][0] + cm[1][0] + cm[0][1] + cm[1][1])
            resultDict['accuracy'].append(round(accuracy, 4))
            recall = cm[0][0] / (cm[0][0] + cm[0][1])
            precision = cm[0][0] / (cm[0][0] + cm[1][0])
            resultDict['recall'].append(round(recall, 4))
            resultDict['precision'].append(round(precision, 4))
            resultDict['f1'].append(
                round(2 * (recall * precision) / (recall + precision), 4))
        # Used later by the save-model view.
        request.session['f1'] = resultDict['f1']

        # TOPSIS: rank the models on all eight metrics at once.
        arr = []
        for i in range(resultDict['regs']):
            arr.append([
                resultDict['tp'][i], resultDict['tn'][i],
                resultDict['fn'][i], resultDict['fp'][i],
                resultDict['accuracy'][i], resultDict['recall'][i],
                resultDict['precision'][i], resultDict['f1'][i],
            ])
        print(arr)
        w = [1, 1, 1, 1, 1, 1, 1, 1]                  # equal criterion weights
        f = ['+', '+', '-', '-', '+', '+', '+', '+']  # benefit/cost flags
        sqr = []
        ord_arr = copy.deepcopy(arr)
        ds = arr
        rows = len(arr)
        cols = len(arr[0])
        # Vector-normalize each column ...
        for i in range(cols):
            sum1 = 0
            for j in range(rows):
                sum1 += ds[j][i] * ds[j][i]
            sqr.append(math.sqrt(sum1))
        # ... normalize the weights ...
        sum2 = sum(w)
        for i in range(cols):
            w[i] = w[i] / sum2
        # ... and build the weighted normalized decision matrix.
        for i in range(cols):
            for j in range(rows):
                ds[j][i] = (ds[j][i] / sqr[i]) * w[i]
        # Ideal best/worst value per criterion.
        best = []
        worst = []
        for i in range(cols):
            max2 = -100000
            min2 = 100000
            for j in range(rows):
                if ds[j][i] > max2:
                    max2 = ds[j][i]
                if ds[j][i] < min2:
                    min2 = ds[j][i]
            if f[i] == '+':
                best.append(max2)
                worst.append(min2)
            elif f[i] == '-':
                best.append(min2)
                worst.append(max2)
        # Euclidean distances to the ideal best (sip) and worst (sin).
        sip = []
        sin = []
        for i in range(rows):
            sumsip = 0
            sumsin = 0
            for j in range(cols):
                sumsip += (ds[i][j] - best[j]) * (ds[i][j] - best[j])
                sumsin += (ds[i][j] - worst[j]) * (ds[i][j] - worst[j])
            sip.append(math.sqrt(sumsip))
            sin.append(math.sqrt(sumsin))
        # Relative closeness to the ideal solution.
        p = []
        for i in range(rows):
            p.append(sin[i] / (sip[i] + sin[i]))
        print(p)
        # Rank array (1 = best); convert floats to ints and to a Python list.
        lst = (len(p) - ss.rankdata(p) + 1).astype(int).tolist()
        print(lst)
        # Break ties by assigning consecutive ranks to equal scores.
        unq = np.unique(np.array(lst)).tolist()
        for i in unq:
            count = 0
            for ind, j in enumerate(lst):
                if i == j:
                    lst[ind] = j + count
                    count += 1
        print(lst)
        resultDict['models'] = []
        for j in range(len(lst)):
            i = lst.index(j + 1)
            resultDict['models'].append(ord_arr[i])
            resultDict['models'][j].append(regnames[i])
            # Also pass the original index so it can be used when the
            # save-model button is clicked.
            resultDict['models'][j].append(i)
        return JsonResponse(resultDict, status=200)
    else:
        return redirect('Link5')
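# Illustration (not part of the original): the TOPSIS block above is easier to
# verify in isolation. A minimal vectorized sketch on a made-up 3-model,
# 2-criterion matrix, with both criteria treated as benefits.
import numpy as np
from scipy.stats import rankdata

ds = np.array([[0.9, 0.8],    # model A
               [0.7, 0.9],    # model B
               [0.6, 0.6]])   # model C
w = np.array([0.5, 0.5])                        # normalized weights
norm = ds / np.sqrt((ds ** 2).sum(axis=0))      # vector normalization
v = norm * w                                    # weighted normalized matrix
best, worst = v.max(axis=0), v.min(axis=0)      # both criteria are benefits
sip = np.sqrt(((v - best) ** 2).sum(axis=1))    # distance to ideal best
sin = np.sqrt(((v - worst) ** 2).sum(axis=1))   # distance to ideal worst
p = sin / (sip + sin)                           # relative closeness
ranks = (len(p) - rankdata(p) + 1).astype(int)  # 1 = best
print(p, ranks)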
def mantel_test(X, Y, perms=10000, method='pearson', tail='two-tail'):
    # Source: https://github.com/jwcarr/MantelTest/blob/master/Mantel.py
    """
    Takes two distance matrices (either redundant matrices or condensed
    vectors) and performs a Mantel test. The Mantel test is a significance
    test of the correlation between two distance matrices.

    Parameters
    ----------
    X : array_like
        First distance matrix (condensed or redundant).
    Y : array_like
        Second distance matrix (condensed or redundant), where the order of
        elements corresponds to the order of elements in the first matrix.
    perms : int, optional
        The number of permutations to perform (default: 10000). A larger
        number gives more reliable results but takes longer to run. If the
        actual number of possible permutations is smaller, the program will
        enumerate all permutations. Enumeration can be forced by setting this
        argument to 0.
    method : str, optional
        Type of correlation coefficient to use; either 'pearson' or
        'spearman' (default: 'pearson').
    tail : str, optional
        Which tail to test in the calculation of the empirical p-value;
        either 'upper', 'lower', or 'two-tail' (default: 'two-tail').

    Returns
    -------
    r : float
        Veridical correlation
    p : float
        Empirical p-value
    z : float
        Standard score (z-score)
    """
    import math
    from itertools import permutations

    # Ensure that X and Y are formatted as NumPy arrays.
    X, Y = np.asarray(X, dtype=float), np.asarray(Y, dtype=float)

    # Check that X and Y are valid distance matrices.
    if not spatial.distance.is_valid_dm(X) and not spatial.distance.is_valid_y(X):
        raise ValueError('X is not a valid condensed or redundant distance matrix')
    if not spatial.distance.is_valid_dm(Y) and not spatial.distance.is_valid_y(Y):
        raise ValueError('Y is not a valid condensed or redundant distance matrix')

    # If X or Y is a redundant distance matrix, reduce it to a condensed one.
    if len(X.shape) == 2:
        X = spatial.distance.squareform(X, force='tovector', checks=False)
    if len(Y.shape) == 2:
        Y = spatial.distance.squareform(Y, force='tovector', checks=False)

    # Check for size equality.
    if X.shape[0] != Y.shape[0]:
        raise ValueError('X and Y are not of equal size')

    # Check for minimum size.
    if X.shape[0] < 3:
        raise ValueError('X and Y should represent at least 3 objects')

    # If Spearman correlation is requested, convert X and Y to ranks.
    if method == 'spearman':
        X, Y = stats.rankdata(X), stats.rankdata(Y)
    # Check for valid method parameter.
    elif method != 'pearson':
        raise ValueError('The method should be set to "pearson" or "spearman"')

    # Check for valid tail parameter.
    if tail not in ('upper', 'lower', 'two-tail'):
        raise ValueError('The tail should be set to "upper", "lower", or "two-tail"')

    # Now we're ready to start the Mantel test using a number of optimizations:
    #
    # 1. We don't need to recalculate the pairwise distances between the objects
    #    on every permutation. They've already been calculated, so we can use a
    #    simple matrix shuffling technique to avoid recomputing them. This works
    #    like memoization.
    #
    # 2. Rather than compute correlation coefficients, we'll just compute the
    #    covariances. This works because the denominator in the equation for the
    #    correlation coefficient will yield the same result however the objects
    #    are permuted, making it redundant. Removing the denominator leaves us
    #    with the covariance.
    #
    # 3. Rather than permute the Y distances and derive the residuals to calculate
    #    the covariance with the X distances, we'll represent the Y residuals in
    #    the matrix and shuffle those directly.
    #
    # 4. If the number of possible permutations is less than the number of
    #    permutations that were requested, we'll run a deterministic test where
    #    we try all possible permutations rather than sample the permutation
    #    space. This gives a faster, deterministic result.

    # Calculate the X and Y residuals, which will be used to compute the
    # covariance under each permutation.
    X_residuals, Y_residuals = X - X.mean(), Y - Y.mean()

    # Expand the Y residuals to a redundant matrix.
    Y_residuals_as_matrix = spatial.distance.squareform(
        Y_residuals, force='tomatrix', checks=False)

    # Get the number of objects.
    m = Y_residuals_as_matrix.shape[0]

    # Calculate the number of possible matrix permutations.
    n = math.factorial(m)

    # Initialize an empty array to store temporary permutations of Y_residuals.
    Y_residuals_permuted = np.zeros(Y_residuals.shape[0], dtype=float)

    # If the number of requested permutations is greater than the number of
    # possible permutations (m!) or the perms parameter is set to 0, then run a
    # deterministic Mantel test ...
    if perms >= n or perms == 0:
        # Initialize an empty array to store the covariances.
        covariances = np.zeros(n, dtype=float)
        # Enumerate all permutations of row/column orders and iterate over them.
        # itertools.permutations yields the identity permutation first, so
        # covariances[0] holds the veridical covariance.
        for i, order in enumerate(permutations(range(m))):
            # Take a permutation of the matrix.
            Y_residuals_as_matrix_permuted = Y_residuals_as_matrix[order, :][:, order]
            # Condense the permuted version of the matrix. Rather than use
            # distance.squareform(), we call directly into the C wrapper for speed.
            spatial.distance._distance_wrap.to_vector_from_squareform_wrap(
                Y_residuals_as_matrix_permuted, Y_residuals_permuted)
            # Compute and store the covariance.
            covariances[i] = (X_residuals * Y_residuals_permuted).sum()
    # ... otherwise run a stochastic Mantel test.
    else:
        # Initialize an empty array to store the covariances.
        covariances = np.zeros(perms, dtype=float)
        # Initialize an array to store the permutation order.
        order = np.arange(m)
        # Store the veridical covariance in 0th position...
        covariances[0] = (X_residuals * Y_residuals).sum()
        # ...and then run the random permutations.
        for i in range(1, perms):
            # Choose a random order in which to permute the rows and columns.
            np.random.shuffle(order)
            # Take a permutation of the matrix.
            Y_residuals_as_matrix_permuted = Y_residuals_as_matrix[order, :][:, order]
            # Condense the permuted version of the matrix (C wrapper, as above).
            spatial.distance._distance_wrap.to_vector_from_squareform_wrap(
                Y_residuals_as_matrix_permuted, Y_residuals_permuted)
            # Compute and store the covariance.
            covariances[i] = (X_residuals * Y_residuals_permuted).sum()

    # Calculate the veridical correlation coefficient from the veridical covariance.
    r = covariances[0] / np.sqrt((X_residuals ** 2).sum() * (Y_residuals ** 2).sum())

    # Calculate the empirical p-value for the requested tail.
    if tail == 'upper':
        p = (covariances >= covariances[0]).sum() / float(covariances.shape[0])
    elif tail == 'lower':
        p = (covariances <= covariances[0]).sum() / float(covariances.shape[0])
    elif tail == 'two-tail':
        p = (abs(covariances) >= abs(covariances[0])).sum() / float(covariances.shape[0])

    # Calculate the standard score (so the return value matches the docstring).
    z = (covariances[0] - covariances.mean()) / covariances.std()

    return r, p, z
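# Illustration (not part of the original): a usage sketch for mantel_test on
# two correlated condensed distance vectors. The point configurations are
# made up; mantel_test itself expects np, spatial, and stats in scope.
import numpy as np
from scipy import spatial, stats

rng = np.random.RandomState(0)
pts_a = rng.rand(5, 3)
pts_b = pts_a + 0.05 * rng.rand(5, 3)  # a noisy copy, so distances correlate
X = spatial.distance.pdist(pts_a)
Y = spatial.distance.pdist(pts_b)

# With 5 objects there are only 5! = 120 permutations, so requesting 999
# triggers the deterministic (full-enumeration) branch.
r, p, z = mantel_test(X, Y, perms=999, tail='upper')
print(r, p, z)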
def updateErr(self, indx, error):
    for i in range(len(indx)):
        self.err[indx[i]] = math.sqrt(error[i])
    # Rank of the error from smallest (1) to largest (N).
    r_err = ss.rankdata(self.err)
    # Higher-error samples get larger (unnormalized) selection weights:
    # rank N maps to 1, rank 1 maps to 1/N.
    self.prob = [1 / (len(r_err) - i + 1) for i in r_err]
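# Illustration (not part of the original): a quick check of the rank-to-weight
# mapping used in updateErr, on made-up errors. The weights are unnormalized;
# a sampler would still need to divide by their sum.
import numpy as np
from scipy import stats as ss

err = np.array([0.5, 2.0, 1.0])
r_err = ss.rankdata(err)                 # [1., 3., 2.]
prob = [1 / (len(r_err) - r + 1) for r in r_err]
print(prob)                              # [1/3, 1.0, 0.5]: largest error weighted most
weights = np.array(prob) / np.sum(prob)  # normalize before sampling
print(weights)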
def check_case(values, method, expected):
    r = rankdata(values, method=method)
    assert_array_equal(r, expected)
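# Illustration (not part of the original): a few calls to check_case showing
# how each rankdata tie-breaking method resolves the duplicate value. Assumes
# scipy.stats.rankdata and numpy.testing.assert_array_equal are in scope.
from scipy.stats import rankdata
from numpy.testing import assert_array_equal

values = [10, 20, 20, 30]
check_case(values, 'average', [1.0, 2.5, 2.5, 4.0])
check_case(values, 'min',     [1, 2, 2, 4])
check_case(values, 'max',     [1, 3, 3, 4])
check_case(values, 'dense',   [1, 2, 2, 3])
check_case(values, 'ordinal', [1, 2, 3, 4])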
def ranked_mwr(mwr, mat, wcol):
    # Replace the weight column with its dense ranks before delegating.
    mat[:, wcol] = rankdata(mat[:, wcol], method='dense')
    return direct_mwr(mwr, mat, wcol)
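# Illustration (not part of the original): ranked_mwr delegates to direct_mwr,
# which isn't shown in this document, so this sketch stubs it out with a
# hypothetical stand-in and demonstrates only the dense-rank substitution.
import numpy as np
from scipy.stats import rankdata

def direct_mwr_stub(mwr, mat, wcol):
    # Hypothetical stand-in for direct_mwr (defined elsewhere): here it just
    # returns the (now rank-valued) weight column.
    return mat[:, wcol]

mat = np.array([[1.0, 10.0],
                [2.0, 10.0],
                [3.0, 25.0]])
mat[:, 1] = rankdata(mat[:, 1], method='dense')  # ties collapse: [1, 1, 2]
print(direct_mwr_stub(None, mat, 1))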
def rolling_rank(np_data):
    # Rank of the last (most recent) element within the window; ties -> min.
    return rankdata(np_data, method='min')[-1]
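# Illustration (not part of the original): rolling_rank is shaped like a
# window reducer, so it slots into pandas' rolling apply. The series values
# are made up.
import pandas as pd
from scipy.stats import rankdata

s = pd.Series([3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0])
# Rank of each value within its trailing 3-observation window.
print(s.rolling(3).apply(rolling_rank, raw=True))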
def fitmodels_direct(catd, mmix, mask, t2s, t2s_full, tes, combmode, ref_img,
                     reindex=False, mmixN=None, full_sel=True, label=None,
                     out_dir='.', verbose=False):
    """
    Fit TE-dependence and -independence models to components.

    Parameters
    ----------
    catd : (S x E x T) array_like
        Input data, where `S` is samples, `E` is echos, and `T` is time
    mmix : (T x C) array_like
        Mixing matrix for converting input data to component space, where `C`
        is components and `T` is the same as in `catd`
    mask : (S [x E]) array_like
        Boolean mask array
    t2s : (S [x T]) array_like
        Limited T2* map or timeseries.
    t2s_full : (S [x T]) array_like
        Full T2* map or timeseries. For voxels with good signal in only one
        echo, which are zeros in the limited T2* map, this map uses the T2*
        estimate using the first two echoes.
    tes : list
        List of echo times associated with `catd`, in milliseconds
    combmode : {'t2s', 'ste'} str
        How optimal combination of echos should be made, where 't2s' indicates
        using the method of Posse 1999 and 'ste' indicates using the method of
        Poser 2006
    ref_img : str or img_like
        Reference image to dictate how outputs are saved to disk
    reindex : bool, optional
        Default: False
    mmixN : array_like, optional
        Default: None
    full_sel : bool, optional
        Whether to perform selection of components based on Rho/Kappa scores.
        Default: True
    label : str, optional
        Prefix for any files written to disk. Default: None
    out_dir : str, optional
        Output directory. Default: '.'
    verbose : bool, optional
        Whether to write out intermediate maps. Default: False

    Returns
    -------
    seldict : dict
    comptab : (N x 5) :obj:`pandas.DataFrame`
        Array with columns denoting (1) index of component, (2) Kappa score of
        component, (3) Rho score of component, (4) variance explained by
        component, and (5) normalized variance explained by component
    betas : :obj:`numpy.ndarray`
    mmix_new : :obj:`numpy.ndarray`
    """
    if not (catd.shape[0] == t2s.shape[0] == t2s_full.shape[0] == mask.shape[0]):
        raise ValueError('First dimensions (number of samples) of catd ({0}), '
                         't2s ({1}), t2s_full ({2}), and mask ({3}) do not '
                         'match'.format(catd.shape[0], t2s.shape[0],
                                        t2s_full.shape[0], mask.shape[0]))
    elif catd.shape[1] != len(tes):
        raise ValueError('Second dimension of catd ({0}) does not match '
                         'number of echoes provided (tes; '
                         '{1})'.format(catd.shape[1], len(tes)))
    elif catd.shape[2] != mmix.shape[0]:
        raise ValueError('Third dimension (number of volumes) of catd ({0}) '
                         'does not match first dimension of '
                         'mmix ({1})'.format(catd.shape[2], mmix.shape[0]))
    elif t2s.shape != t2s_full.shape:
        raise ValueError('Shape of t2s array {0} does not match shape of '
                         't2s_full array {1}'.format(t2s.shape, t2s_full.shape))
    elif t2s.ndim == 2:
        if catd.shape[2] != t2s.shape[1]:
            raise ValueError('Third dimension (number of volumes) of catd '
                             '({0}) does not match second dimension of '
                             't2s ({1})'.format(catd.shape[2], t2s.shape[1]))

    # Override the supplied mask: only voxels with a valid T2* estimate are usable.
    mask = t2s != 0

    # compute optimal combination of raw data
    tsoc = combine.make_optcom(catd, tes, mask, t2s=t2s_full,
                               combmode=combmode, verbose=False).astype(float)[mask]
    # demean optimal combination
    tsoc_dm = tsoc - tsoc.mean(axis=-1, keepdims=True)

    # compute un-normalized weight dataset (features)
    if mmixN is None:
        mmixN = mmix
    WTS = computefeats2(utils.unmask(tsoc, mask), mmixN, mask, normalize=False)

    # compute PSC dataset - shouldn't have to refit data
    tsoc_B = get_coeffs(tsoc_dm, mmix, mask=None)
    tsoc_Babs = np.abs(tsoc_B)
    PSC = tsoc_B / tsoc.mean(axis=-1, keepdims=True) * 100

    # compute skews to determine signs based on unnormalized weights,
    # correct mmix & WTS signs based on spatial distribution tails
    signs = stats.skew(WTS, axis=0)
    signs /= np.abs(signs)
    mmix = mmix.copy()
    mmix *= signs
    WTS *= signs
    PSC *= signs
    totvar = (tsoc_B**2).sum()
    totvar_norm = (WTS**2).sum()

    # compute Betas and means over TEs for TE-dependence analysis
    betas = get_coeffs(catd, mmix,
                       np.repeat(mask[:, np.newaxis], len(tes), axis=1))
    n_samp, n_echos, n_components = betas.shape
    n_voxels = mask.sum()
    n_data_voxels = (t2s != 0).sum()
    mu = catd.mean(axis=-1, dtype=float)
    tes = np.reshape(tes, (n_echos, 1))
    fmin, _, _ = utils.getfbounds(n_echos)

    # mask arrays
    mumask = mu[t2s != 0]
    t2smask = t2s[t2s != 0]
    betamask = betas[t2s != 0]

    # set up Xmats
    X1 = mumask.T  # Model 1
    X2 = np.tile(tes, (1, n_data_voxels)) * mumask.T / t2smask.T  # Model 2

    # tables for component selection
    kappas = np.zeros([n_components])
    rhos = np.zeros([n_components])
    varex = np.zeros([n_components])
    varex_norm = np.zeros([n_components])
    Z_maps = np.zeros([n_voxels, n_components])
    F_R2_maps = np.zeros([n_data_voxels, n_components])
    F_S0_maps = np.zeros([n_data_voxels, n_components])
    Z_clmaps = np.zeros([n_voxels, n_components])
    F_R2_clmaps = np.zeros([n_data_voxels, n_components])
    F_S0_clmaps = np.zeros([n_data_voxels, n_components])
    Br_R2_clmaps = np.zeros([n_voxels, n_components])
    Br_S0_clmaps = np.zeros([n_voxels, n_components])
    pred_R2_maps = np.zeros([n_data_voxels, n_echos, n_components])
    pred_S0_maps = np.zeros([n_data_voxels, n_echos, n_components])

    LGR.info('Fitting TE- and S0-dependent models to components')
    for i_comp in range(n_components):
        # size of B is (n_echoes, n_samples)
        B = np.atleast_3d(betamask)[:, :, i_comp].T
        alpha = (np.abs(B)**2).sum(axis=0)
        varex[i_comp] = (tsoc_B[:, i_comp]**2).sum() / totvar * 100.
        varex_norm[i_comp] = (utils.unmask(WTS, mask)[t2s != 0][:, i_comp]**2).sum() /\
            totvar_norm * 100.

        # S0 Model
        # (S,) model coefficient map
        coeffs_S0 = (B * X1).sum(axis=0) / (X1**2).sum(axis=0)
        pred_S0 = X1 * np.tile(coeffs_S0, (n_echos, 1))
        pred_S0_maps[:, :, i_comp] = pred_S0.T
        SSE_S0 = (B - pred_S0)**2
        SSE_S0 = SSE_S0.sum(axis=0)  # (S,) prediction error map
        F_S0 = (alpha - SSE_S0) * (n_echos - 1) / (SSE_S0)
        F_S0_maps[:, i_comp] = F_S0

        # R2 Model
        coeffs_R2 = (B * X2).sum(axis=0) / (X2**2).sum(axis=0)
        pred_R2 = X2 * np.tile(coeffs_R2, (n_echos, 1))
        pred_R2_maps[:, :, i_comp] = pred_R2.T
        SSE_R2 = (B - pred_R2)**2
        SSE_R2 = SSE_R2.sum(axis=0)
        F_R2 = (alpha - SSE_R2) * (n_echos - 1) / (SSE_R2)
        F_R2_maps[:, i_comp] = F_R2

        # compute weights as Z-values
        wtsZ = (WTS[:, i_comp] - WTS[:, i_comp].mean()) / WTS[:, i_comp].std()
        wtsZ[np.abs(wtsZ) > Z_MAX] = (
            Z_MAX * (np.abs(wtsZ) / wtsZ))[np.abs(wtsZ) > Z_MAX]
        Z_maps[:, i_comp] = wtsZ

        # compute Kappa and Rho
        F_S0[F_S0 > F_MAX] = F_MAX
        F_R2[F_R2 > F_MAX] = F_MAX
        norm_weights = np.abs(
            np.squeeze(utils.unmask(wtsZ, mask)[t2s != 0]**2.))
        kappas[i_comp] = np.average(F_R2, weights=norm_weights)
        rhos[i_comp] = np.average(F_S0, weights=norm_weights)

    # tabulate component values
    comptab = np.vstack([kappas, rhos, varex, varex_norm]).T
    if reindex:
        # re-index all components in Kappa order
        sort_idx = comptab[:, 0].argsort()[::-1]
        comptab = comptab[sort_idx, :]
        mmix_new = mmix[:, sort_idx]
        betas = betas[..., sort_idx]
        pred_R2_maps = pred_R2_maps[:, :, sort_idx]
        pred_S0_maps = pred_S0_maps[:, :, sort_idx]
        F_S0_maps = F_S0_maps[:, sort_idx]
        F_R2_maps = F_R2_maps[:, sort_idx]
        Z_maps = Z_maps[:, sort_idx]
        WTS = WTS[:, sort_idx]
        PSC = PSC[:, sort_idx]
        tsoc_B = tsoc_B[:, sort_idx]
        tsoc_Babs = tsoc_Babs[:, sort_idx]
    else:
        mmix_new = mmix

    if verbose:
        # Echo-specific weight maps for each of the ICA components.
        io.filewrite(betas, op.join(out_dir, label + 'betas_catd.nii'), ref_img)
        # Echo-specific maps of predicted values for R2 and S0 models for each
        # component.
        io.filewrite(utils.unmask(pred_R2_maps, mask),
                     op.join(out_dir, label + 'R2_pred.nii'), ref_img)
        io.filewrite(utils.unmask(pred_S0_maps, mask),
                     op.join(out_dir, label + 'S0_pred.nii'), ref_img)
        # Weight maps used to average metrics across voxels
        io.filewrite(utils.unmask(Z_maps**2., mask),
                     op.join(out_dir, label + 'metric_weights.nii'), ref_img)

    comptab = pd.DataFrame(comptab,
                           columns=['kappa', 'rho', 'variance explained',
                                    'normalized variance explained'])
    comptab.index.name = 'component'

    # full selection including clustering criteria
    seldict = None
    if full_sel:
        LGR.info('Performing spatial clustering of components')
        csize = np.max([int(n_voxels * 0.0005) + 5, 20])
        LGR.debug('Using minimum cluster size: {}'.format(csize))
        for i_comp in range(n_components):
            # save out files
            out = np.zeros((n_samp, 4))
            out[:, 0] = np.squeeze(utils.unmask(PSC[:, i_comp], mask))
            out[:, 1] = np.squeeze(utils.unmask(F_R2_maps[:, i_comp], t2s != 0))
            out[:, 2] = np.squeeze(utils.unmask(F_S0_maps[:, i_comp], t2s != 0))
            out[:, 3] = np.squeeze(utils.unmask(Z_maps[:, i_comp], mask))
            ccimg = io.new_nii_like(ref_img, out)

            # Do simple clustering on F
            sel = spatclust(ccimg, min_cluster_size=csize, threshold=fmin,
                            index=[1, 2], mask=(t2s != 0))
            F_R2_clmaps[:, i_comp] = sel[:, 0]
            F_S0_clmaps[:, i_comp] = sel[:, 1]
            countsigFR2 = F_R2_clmaps[:, i_comp].sum()
            countsigFS0 = F_S0_clmaps[:, i_comp].sum()

            # Do simple clustering on Z at p<0.05
            sel = spatclust(ccimg, min_cluster_size=csize, threshold=1.95,
                            index=3, mask=mask)
            Z_clmaps[:, i_comp] = sel

            # Do simple clustering on ranked signal-change map
            spclust_input = utils.unmask(stats.rankdata(tsoc_Babs[:, i_comp]), mask)
            spclust_input = io.new_nii_like(ref_img, spclust_input)
            Br_R2_clmaps[:, i_comp] = spatclust(
                spclust_input, min_cluster_size=csize,
                threshold=max(tsoc_Babs.shape) - countsigFR2, mask=mask)
            Br_S0_clmaps[:, i_comp] = spatclust(
                spclust_input, min_cluster_size=csize,
                threshold=max(tsoc_Babs.shape) - countsigFS0, mask=mask)

        seldict = {}
        selvars = ['WTS', 'tsoc_B', 'PSC', 'Z_maps', 'F_R2_maps', 'F_S0_maps',
                   'Z_clmaps', 'F_R2_clmaps', 'F_S0_clmaps',
                   'Br_R2_clmaps', 'Br_S0_clmaps']
        for vv in selvars:
            seldict[vv] = eval(vv)

    return seldict, comptab, betas, mmix_new
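# Illustration (not part of the original): the kappa/rho step above reduces to
# a weighted average of capped F-statistics. A minimal sketch on synthetic
# arrays; the Z_MAX and F_MAX values are assumptions standing in for the
# capping constants the function expects to find in scope.
import numpy as np

Z_MAX, F_MAX = 8.0, 500.0  # assumed capping constants
rng = np.random.RandomState(0)
wts = rng.randn(1000)               # per-voxel component weights
F_R2 = rng.chisquare(2, 1000) * 50  # synthetic F-statistics

# Z-score the weights and cap their magnitude at Z_MAX.
wtsZ = (wts - wts.mean()) / wts.std()
wtsZ = np.clip(wtsZ, -Z_MAX, Z_MAX)

# Cap the F-statistics and take the weight-squared average: voxels that load
# strongly on the component dominate its kappa score.
F_R2 = np.minimum(F_R2, F_MAX)
kappa = np.average(F_R2, weights=wtsZ**2)
print(kappa)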
# plt.plot(prop_average_scores, 'b--^', label='PI_avg_score')
# plt.legend(bbox_to_anchor=(0.7, 1))
# plt.xlabel('proposal number')
# plt.ylabel('scores')
# plt.show()

######################## Step 3, Generate global ranking #################
# Generate the score matrix and rank matrix for the assignment.
simulated_assignment_scores = np.zeros((n, m))
simulated_assignment_ranks = np.zeros((n, m))
for i in range(n):
    for j in range(m):
        # Fill in the simulated scores for the successful assignment.
        simulated_assignment_scores[i, j] = prop_scores[i, assignment[i, j]]
    # Assign ranks to assignment_scores, dealing with ties appropriately.
    simulated_assignment_ranks[i] = rankdata(simulated_assignment_scores[i]) - 1

prop_total_scores = np.zeros(n)
MBC_total_scores = np.zeros(n)
for i in range(n):
    # Find the indices where each proposal appears in the assignment.
    find_index = np.where(assignment == i)
    prop_index = np.asarray(find_index).T.tolist()
    for (r, v) in prop_index:
        # Sum the simulated ranks this proposal received from different PIs.
        prop_total_scores[i] += simulated_assignment_ranks[r, v]
    MBC_total_scores[i] = prop_total_scores[i] / (m * (m - 1))
prop_global_ranks = rankdata(MBC_total_scores) - 1
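# Illustration (not part of the original): a self-contained toy run of the
# same rank-then-aggregate idea, with a made-up 3-reviewer x 3-proposal score
# matrix standing in for the simulated assignment above.
import numpy as np
from scipy.stats import rankdata

# Rows: reviewers; columns: proposals. Higher score = better.
scores = np.array([[4.0, 2.0, 3.0],
                   [5.0, 1.0, 1.0],
                   [3.0, 4.0, 2.0]])

# Each reviewer converts scores to 0-based ranks (ties share averaged ranks).
ranks = np.vstack([rankdata(row) - 1 for row in scores])

# A proposal's Borda-style total is the sum of the ranks it received.
totals = ranks.sum(axis=0)
global_ranks = rankdata(totals) - 1  # higher total -> higher global rank
print(totals, global_ranks)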