def correlation(mat_file_1, mat_file_2):
    """
    Draws the self- vs cross-correlation scatter plot
    """
    blockades_1 = read_mat(mat_file_1)
    blockades_1 = sp._fractional_blockades(blockades_1)
    blockades_1 = sp._filter_by_duration(blockades_1, 0.5, 20)
    # materialize as a list: the traces are iterated multiple times below
    blockades_1 = [sp.discretize(sp._trim_flank_noise(b.eventTrace), 20)
                   for b in blockades_1]

    blockades_2 = read_mat(mat_file_2)
    blockades_2 = sp._fractional_blockades(blockades_2)
    blockades_2 = sp._filter_by_duration(blockades_2, 0.5, 20)
    blockades_2 = [sp.discretize(sp._trim_flank_noise(b.eventTrace), 20)
                   for b in blockades_2]

    self_corr = []
    cross_corr = []
    for blockade in blockades_1:
        block_self = [1 - distance.correlation(blockade, other)
                      for other in blockades_1]
        block_cross = [1 - distance.correlation(blockade, other)
                       for other in blockades_2]
        self_corr.append(np.mean(block_self))
        cross_corr.append(np.mean(block_cross))

    median_self = np.median(self_corr)
    median_cross = np.median(cross_corr)

    matplotlib.rcParams.update({"font.size": 16})
    fig = plt.subplot()
    fig.spines["right"].set_visible(False)
    fig.spines["top"].set_visible(False)
    fig.get_xaxis().tick_bottom()
    fig.get_yaxis().tick_left()
    fig.set_xlim(-0.6, 0.6)
    fig.set_ylim(-0.6, 0.6)
    fig.set_xlabel("(H3 tail, H3 tail) correlation")
    fig.set_ylabel("(H3 tail, CCL5) correlation")

    for y in [-0.4, -0.2, 0, 0.2, 0.4]:
        plt.plot((-0.6, 0.6), (y, y), "--", lw=0.5, color="black")
        plt.plot((y, y), (-0.6, 0.6), "--", lw=0.5, color="black")
    plt.plot((-0.6, 0.6), (median_cross, median_cross), "--", lw=1.5, color="red")
    plt.plot((median_self, median_self), (-0.6, 0.6), "--", lw=1.5, color="red")

    fig.scatter(self_corr, cross_corr, linewidth=0.5, c="dodgerblue", s=30,
                edgecolor="blue")
    plt.tight_layout()
    plt.show()
def alignment(agent_a_before, agent_b_before, agent_a_after, agent_b_after):
    """Change in correlation distance between two agents."""
    d_before = dist.correlation(
        dist.squareform(agent_a_before.op.graph.adj),
        dist.squareform(agent_b_before.op.graph.adj),
    )
    d_after = dist.correlation(
        dist.squareform(agent_a_after.op.graph.adj),
        dist.squareform(agent_b_after.op.graph.adj),
    )
    # positive when the agents moved closer together
    return -1 * (d_after - d_before)
def seqcor(m1, m2, seq=None):
    """Calculates motif similarity based on Pearson correlation of scores.

    Based on Kielbasa (2015) and Grau (2015). Scores are calculated based on
    scanning a de Bruijn sequence of 7-mers. This sequence is taken from
    ShortCAKE (Orenstein & Shamir, 2015). Optionally another sequence can be
    given as an argument.

    Parameters
    ----------
    m1 : Motif instance
        Motif 1 to compare.
    m2 : Motif instance
        Motif 2 to compare.
    seq : str, optional
        Sequence to use for scanning instead of the k=7 de Bruijn sequence.

    Returns
    -------
    score, position, strand
    """
    l1 = len(m1)
    l2 = len(m2)
    l = max(l1, l2)

    if seq is None:
        seq = RCDB
    L = len(seq)

    # Scan the RC de Bruijn sequence with both motifs
    result1 = pfmscan(seq, m1.pwm, m1.pwm_min_score(), len(seq), False, True)
    result2 = pfmscan(seq, m2.pwm, m2.pwm_min_score(), len(seq), False, True)

    # Reverse complement of motif 2
    result3 = pfmscan(seq, m2.rc().pwm, m2.rc().pwm_min_score(), len(seq),
                      False, True)

    result1 = np.array(result1)
    result2 = np.array(result2)
    result3 = np.array(result3)

    # Return the maximum correlation over all tested offsets and strands
    c = []
    for i in range(l1 - l1 // 3):
        c.append([1 - distance.correlation(result1[:L - l - i], result2[i:L - l]), i, 1])
        c.append([1 - distance.correlation(result1[:L - l - i], result3[i:L - l]), i, -1])
    for i in range(l2 - l2 // 3):
        c.append([1 - distance.correlation(result1[i:L - l], result2[:L - l - i]), -i, 1])
        c.append([1 - distance.correlation(result1[i:L - l], result3[:L - l - i]), -i, -1])
    return sorted(c, key=lambda x: x[0])[-1]
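# A minimal sketch of the offset-scanning idea in seqcor above, using
# hypothetical 1-D numpy score arrays in place of real motif scan results:
# the best offset is the one maximizing 1 - correlation distance (Pearson r).
import numpy as np
from scipy.spatial import distance

scores1 = np.sin(np.linspace(0, 6 * np.pi, 200))
scores2 = np.roll(scores1, 5)  # the same signal, shifted by 5 positions

best_r, best_offset = max(
    (1 - distance.correlation(scores1[:-10 - i], scores2[i:-10]), i)
    for i in range(10)
)
# best_r is ~1.0 at best_offset == 5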
def cor_dist(a, b):
    """
    Calculates the correlation distance between a reference vector a and
    one or more vectors b.

    :param a: A single query image
    :param b: One or more reference images
    :return: a single distance, or a list of distances if b is a list
    """
    a = a.flatten()
    if isinstance(b, list):
        return [correlation(a, img.flatten()) for img in b]
    return correlation(a, b.flatten())
def smaf(X, d, lda1, lda2, maxItr=10, UW=None, posW=False, posU=True,
         use_chol=False, module_lower=500, activity_lower=5, donorm=False,
         mode=1, mink=5, U0=[], U0_delta=0.1, doprint=False):
    # use Cholesky when we expect a very sparse result
    # this tends to happen more on the full vs subsampled matrices
    if UW is None:
        U, W = spams.nmf(np.asfortranarray(X), return_lasso=True, K=d,
                         numThreads=THREADS)
        W = np.asarray(W.todense())
    else:
        U, W = UW
    Xhat = U.dot(W)
    Xnorm = np.linalg.norm(X) ** 2 / X.shape[1]
    for itr in range(maxItr):
        if mode == 1:
            # In this mode the ldas correspond to an approximate desired fit
            # Higher lda will be a worse fit, but will result in a sparser sol'n
            U = spams.lasso(np.asfortranarray(X.T), D=np.asfortranarray(W.T),
                            lambda1=lda2 * Xnorm, mode=1, numThreads=THREADS,
                            cholesky=use_chol, pos=posU)
            U = np.asarray(U.todense()).T
        elif mode == 2:
            if len(U0) > 0:
                U = projected_grad_desc(W.T, X.T, U.T, U0.T, lda2, U0_delta,
                                        maxItr=400)
                U = U.T
            else:
                U = spams.lasso(np.asfortranarray(X.T), D=np.asfortranarray(W.T),
                                lambda1=lda2, lambda2=0.0, mode=2,
                                numThreads=THREADS, cholesky=use_chol, pos=posU)
                U = np.asarray(U.todense()).T
        if donorm:
            U = U / np.linalg.norm(U, axis=0)
            U[np.isnan(U)] = 0
        if mode == 1:
            wf = (1 - lda2)
            W = sparse_decode(X, U, lda1, worstFit=wf, mink=mink)
        elif mode == 2:
            if len(U0) > 0:
                W = projected_grad_desc(U, X, W, [], lda1, 0., nonneg=posW,
                                        maxItr=400)
            else:
                W = spams.lasso(np.asfortranarray(X), D=np.asfortranarray(U),
                                lambda1=lda1, lambda2=1.0, mode=2,
                                numThreads=THREADS, cholesky=use_chol, pos=posW)
                W = np.asarray(W.todense())
        Xhat = U.dot(W)
        module_size = np.average([np.exp(entropy(u)) for u in U.T if u.sum() > 0])
        activity_size = np.average([np.exp(entropy(abs(w))) for w in W.T])
        if doprint:
            print(distance.correlation(X.flatten(), Xhat.flatten()),
                  module_size, activity_size, lda1, lda2)
        if module_size < module_lower:
            lda2 /= 2.
        if activity_size < activity_lower:
            lda2 /= 2.
    return U, W
def get_feat(trainDTMatrix, male, female):
    featMatrix = []
    for i in range(trainDTMatrix.shape[0]):
        row = trainDTMatrix[i, :].tolist()[0]
        # distances of this row to the male and female reference vectors
        tempfeat = [
            correlation(male, row),
            cosine(male, row),
            euclidean(male, row),
            correlation(female, row),
            cosine(female, row),
            euclidean(female, row),
        ]
        featMatrix.append(tempfeat)
    featMatrix = numpy.matrix(featMatrix)
    featMatrix = numpy.nan_to_num(featMatrix)
    return featMatrix
def cosine_similarity(number_of_recomm, user_input_movies, user_input_ratings):
    # Create the mean-filled dense matrix
    dense_matrix = create_dense()

    # Create the user array, initialized with the global mean rating
    mean_rating_1 = mean_rating()
    user = np.repeat(mean_rating_1, dense_matrix.shape[1])
    user_df = pd.DataFrame([user], columns=dense_matrix.columns)

    # Collect user input
    user_mov_index = convert_user_input(user_input_movies, user_input_ratings)

    # Impute user ratings
    for mov_id in user_mov_index:
        user_df[mov_id[0]] = mov_id[1]

    # Append it to the original user-movie matrix
    dense_matrix_user = pd.concat([dense_matrix, user_df], ignore_index=False)

    # Create the user-user similarity matrix (despite the function name,
    # similarity here is 1 - correlation distance, i.e. Pearson correlation)
    UU = np.zeros((len(dense_matrix_user), len(dense_matrix_user)))
    UU = pd.DataFrame(UU, index=dense_matrix_user.index,
                      columns=dense_matrix_user.index)

    # calculate the similarities of the active user (index 0) with everyone
    u = 0
    for v in UU.columns:
        UU.loc[u, v] = 1 - distance.correlation(dense_matrix_user.loc[u],
                                                dense_matrix_user.loc[v])

    # find similarities for active_user, sort, and take entries 1 to 5
    # (entry 0 contains the similarity with itself)
    active_user = 0
    neighbors = UU.loc[active_user].sort_values(ascending=False)[1:6]

    # Final matrix
    neighbors_m = dense_matrix_user.loc[neighbors.index]

    # Pick a random neighbor and suggest the movies that person rated highest
    random_mov = np.random.randint(len(neighbors_m))
    movies_list = list(neighbors_m.iloc[random_mov]
                       .sort_values(ascending=False)
                       .head(number_of_recomm).index.map(movie_id_dict))
    return movies_list
def correlation(x, y):
    try:
        return distance.correlation(x, y)
    except Exception:  # ValueError on bad input; anything else becomes NaN too
        return np.nan
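# Usage sketch for the wrapper above: a constant input either raises or
# yields NaN depending on the scipy version (Pearson r is undefined there);
# either way the wrapper returns NaN instead of propagating an exception.
print(correlation([1.0, 2.0, 3.0], [1.0, 2.0, 3.0]))  # 0.0 (identical vectors)
print(correlation([1.0, 1.0, 1.0], [1.0, 2.0, 3.0]))  # nan (constant vector)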
def get_nearest_neighbor(self, x_test, k, sample_class):
    # map option names to the corresponding scipy distance functions
    # (the option spellings are kept exactly as defined by the caller)
    metrics = {
        'jaccard': dis.jaccard,
        'dice': dis.dice,
        'correlation': dis.correlation,
        'yule': dis.yule,
        'russelo-rao': dis.russellrao,
        'sokal-michener': dis.sokalmichener,
        'rogers-tanimoto': dis.rogerstanimoto,
        'kulzinsky': dis.kulsinski,
    }
    metric = metrics[self.distance_calculator]

    distances = []
    targets_index = []
    for i in range(len(sample_class)):
        if (sample_class[i][:] != x_test).any():
            distances.append([metric(x_test, sample_class[i][:]), i])

    # make a list of the k neighbors' targets
    distances.sort()
    for i in range(k):
        targets_index.append(distances[i][1])
    return targets_index
def isADoor(contX, contY):
    contXNew, contYNew = getNewXAndY(contX, contY, len(x))
    # Earlier experiments compared the raw profiles (and their ratio) with
    # ssd.correlation / spearmanr / np.correlate; comparing the deltas of
    # the y-profiles worked best, so only that measure is kept here.
    deltaX, deltaY = genDeltaXAndY(x, y)
    contDeltaX, contDeltaY = genDeltaXAndY(contXNew, contYNew)
    corr = ssd.correlation(deltaY, contDeltaY)
    return corr < 0.1
def test_embeddingset_plot_arrow_emb_axis_with_different_axis_metric(embset):
    fig, ax = mpl.pyplot.subplots()
    embset.plot(
        kind="arrow",
        x_axis=embset["blue"],
        y_axis="red",
        axis_metric=[scipy_distance.correlation, "cosine_similarity"],
        x_label="xx",
        color="magenta",
    )
    vectors = []
    for emb in embset.embeddings.values():
        vectors.append([
            scipy_distance.correlation(emb.vector, embset["blue"].vector),
            1.0 - scipy_distance.cosine(emb.vector, embset["red"].vector),
        ])
    vectors = np.array(vectors)
    props = {
        "type": mpl.collections.PolyCollection,
        "data": vectors,
        "x_label": "xx",
        "y_label": "red",
        "title": "",
        "label": list(embset.embeddings.keys()),
        "color": mpl.colors.to_rgba_array("magenta"),
        "aspect": "auto",
    }
    UV = np.concatenate(
        (ax.collections[1].U[:, None], ax.collections[1].V[:, None]), axis=-1)
    assert isinstance(ax.collections[1], props["type"])
    assert np.array_equal(UV, props["data"])
    assert [t.get_text() for t in ax.texts] == props["label"]
    assert np.array_equal(ax.collections[1].get_facecolors(), props["color"])
    validate_plot_general_properties(ax, props)
    mpl.pyplot.close(fig)
def evaluate_continue_change(con_distri_features, soft_add, software):
    if software == 'multi-metric':
        passed_qc_sc_DF = pd.read_table(soft_add, header=0, index_col=0)
        phenotime = passed_qc_sc_DF[['ord']]
        phenotime.columns = ['Pseudotime']
    elif (software == 'wishbone') | (software == 'CIRCLET'):
        phenotime = pd.read_table(soft_add, header=None, index_col=0)
        phenotime.columns = ['Pseudotime']
    ordIndex = phenotime.sort_values(by='Pseudotime')
    old_sc_name = ordIndex.index[-1]
    corr_list = list()
    for sc_name in ordIndex.index:
        x = con_distri_features.loc[old_sc_name]
        y = con_distri_features.loc[sc_name]
        old_sc_name = sc_name
        # |correlation distance - 1| = |Pearson r| between consecutive cells
        temp = np.abs(distance.correlation(x, y) - 1)
        corr_list.append(temp)
    evaluation_value = np.mean(corr_list)
    return evaluation_value
def Dist(array1, array2, dist, VI=None, V=None):
    # VI (inverse covariance matrix) and V (variance vector) are only needed
    # for the mahalanobis / seuclidean metrics; scipy cannot infer them from
    # two single vectors, so they must be passed in.
    if dist == 'braycurtis':
        return distance.braycurtis(array1, array2)
    elif dist == 'correlation':
        return distance.correlation(array1, array2)
    elif dist == 'mahalanobis':
        return distance.mahalanobis(array1, array2, VI)
    elif dist == 'minkowski':
        return distance.minkowski(array1, array2)
    elif dist == 'seuclidean':
        return distance.seuclidean(array1, array2, V)
    elif dist == 'sqeuclidean':
        return distance.sqeuclidean(array1, array2)
    elif dist == 'pearsonp':
        r, p = pearsonr(array1, array2)
        return p
    elif dist == 'pearsonr':
        r, p = pearsonr(array1, array2)
        return r
    elif dist == 'spearmanp':
        r, p = spearmanr(array1, array2)
        return p
    elif dist == 'spearmanr':
        r, p = spearmanr(array1, array2)
        return r
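# Usage sketch for Dist with the metrics that need extra arguments,
# on hypothetical data; VI and V are the optional arguments added above:
# scipy's mahalanobis wants the inverse covariance matrix, seuclidean the
# variance vector, both estimated here from a sample.
import numpy as np

obs = np.random.default_rng(0).normal(size=(100, 3))
VI = np.linalg.inv(np.cov(obs, rowvar=False))
V = np.var(obs, axis=0, ddof=1)
print(Dist(obs[0], obs[1], 'mahalanobis', VI=VI))
print(Dist(obs[0], obs[1], 'seuclidean', V=V))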
def _dodetect(self):
    dist = np.zeros((self.img1.shape[0], self.img1.shape[1]))
    for i in range(self.img1.shape[0]):
        for j in range(self.img1.shape[1]):
            # per-pixel correlation distance across the spectral bands
            dist[i, j] = distance.correlation(self.img1[i, j, :],
                                              self.img2[i, j, :])
    self.change = dist
def computeDistance(func: str, Vi: typ.List[float], Vj: typ.List[float]) -> typ.Union[float, int]:
    """
    Computes the distance using the provided distance function.

    :param func: Lowercase string name of the function to use.
    :param Vi: First 1d vector
    :param Vj: Second 1d vector

    :type func: str
    :type Vi: typ.List[float]
    :type Vj: typ.List[float]

    :return: the distance value between the two vectors
    :rtype: typ.Union[float, int]
    """
    if func == "czekanowski":  # if the function provided was Czekanowski
        return __Czekanowski(Vi, Vj)
    elif func == "euclidean":  # if the euclidean distance was requested
        return __Euclidean(Vi, Vj)
    elif func == "correlation":  # if the correlation distance was requested
        return sp.correlation(Vi, Vj)
    elif func == "cosine":  # if the cosine function was requested
        # NOTE: this computes the distance; to compute similarity subtract the result from 1
        return sp.cosine(Vi, Vj)
    else:  # if no valid distance function was provided, default to the euclidean distance
        return __Euclidean(Vi, Vj)
def calculateL2(self, feat1, feat2, c_type='euclidean'):
    assert np.shape(feat1) == np.shape(feat2)
    if config.insight:
        [len_] = np.shape(feat1)
    else:
        _, len_ = np.shape(feat1)

    if c_type == "cosine":
        s_d = distance.cosine(feat1, feat2)
    elif c_type == "euclidean":
        # unweighted; use w=1./len_ for a length-normalized variant
        s_d = distance.euclidean(feat1, feat2, w=1)
    elif c_type == "correlation":
        s_d = distance.correlation(feat1, feat2)
    elif c_type == "braycurtis":
        s_d = distance.braycurtis(feat1, feat2)
    elif c_type == 'canberra':
        s_d = distance.canberra(feat1, feat2)
    elif c_type == "chebyshev":
        s_d = distance.chebyshev(feat1, feat2)
    return s_d
def correlate(self, preFeature, curFeature):
    # Return a correlation score between img1 and img2. The higher the better!
    # Feature for VGG. The histogram variant below was unreachable after the
    # return in the original, so it is kept only as a comment:
    # cv2.compareHist(preFeature, curFeature, cv2.HISTCMP_CORREL)
    return 1 - correlation(preFeature, curFeature)
def get_most_similar(v1, res_features):
    dist, best = float("inf"), None
    for i in res_features:
        if v1 != res_features[i]:
            distancia = distance.correlation(v1, res_features[i])
            if distancia < dist:
                dist, best = distancia, i
    return best
def NewsToTweetsScor_pair(newsVecList, newsWordList, tweetVecList,
                          tweetWordList, scoreFile):
    print('Score pair wise start')
    newsVecList_len = len(newsVecList)
    tweetVecList_len = len(tweetVecList)
    total_dist = []
    for i in range(newsVecList_len):
        u = newsVecList[i]
        print(i, ' = ', end='')
        u_to_v = []
        for j in range(tweetVecList_len):
            v = tweetVecList[j]
            # average of six scipy distances between the two vectors
            val = distance.cosine(u, v)
            val += distance.euclidean(u, v)
            val += distance.dice(u, v)
            val += distance.correlation(u, v)
            val += distance.jaccard(u, v)
            val += distance.cityblock(u, v)
            val = val / 6.0
            u_to_v.append(val)
        total_dist.append(u_to_v)
    print('pair wise end')
    return total_dist
def kmeans_classify(d, means, metric="Euclidean"):
    # one distance function per supported metric; the per-metric update
    # logic was identical, so it is factored out of the if/elif chain
    metrics = {
        "Euclidean": distance.euclidean,
        "L1-Norm": distance.cityblock,
        "Hamming": distance.hamming,
        "Correlation": distance.correlation,
        "Cosine": distance.cosine,
    }
    dist_fun = metrics[metric]

    ids = [0] * d.shape[0]
    distances = [float("inf")] * d.shape[0]
    for i in range(d.shape[0]):
        for j in range(means.shape[0]):
            dis = dist_fun(d[i], means[j])
            if dis <= distances[i]:
                distances[i] = dis
                ids[i] = j
    return np.matrix(ids).reshape(d.shape[0], 1), np.matrix(distances).reshape(
        d.shape[0], 1)
def correlation_am_word2vec(self, row):
    try:
        if row['id'] % 10000 == 0:
            elapsed = time.time() - start_time
            print("Processed {:10.0f} questions in {:10.0f} s ".format(
                row['id'], elapsed))
    except KeyError:
        if row['test_id'] % 10000 == 0:
            elapsed = time.time() - start_time
            print("Processed {:10.0f} questions in {:10.0f} s ".format(
                row['test_id'], elapsed))
    q1 = self.getWordVecs(row['question1'])
    q2 = self.getWordVecs(row['question2'])
    if len(q1) == 0 or len(q2) == 0:
        return 0
    # average the word vectors of each question, then score their distance
    q1_vec = np.zeros(300)
    q2_vec = np.zeros(300)
    for word in q1:
        q1_vec += self.wordvecs[word]
    q1_vec /= len(q1)
    for word in q2:
        q2_vec += self.wordvecs[word]
    q2_vec /= len(q2)
    score = correlation(q1_vec, q2_vec)
    return score
def profile_sim(
    prof1: Iterable[float],
    prof2: Iterable[float],
) -> float:
    """Calculates the similarity of two activity profiles of the same length.

    The profiles are compared by correlation distance,
    ``scipy.spatial.distance.correlation()`` (i.e. 1 - Pearson correlation).

    Parameters:
    ===========
    prof1: The first profile to compare.
    prof2: The second profile to compare.
        The two profiles have to be of equal length.

    Returns:
    ========
    Similarity value between 0.0 .. 1.0
    (0.0 being very dissimilar and 1.0 identical)."""
    assert len(prof1) == len(
        prof2), "Activity Profiles must have the same length to be compared."

    if not isinstance(prof1, np.ndarray):
        prof1 = np.array(prof1)
    prof1 = np.clip(prof1, -25.0, 25.0)
    if not isinstance(prof2, np.ndarray):
        prof2 = np.array(prof2)
    prof2 = np.clip(prof2, -25.0, 25.0)

    result = 1 - dist.correlation(prof1, prof2)
    if np.isnan(result) or result < 0.0:
        result = 0.0
    return result
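# Usage sketch for profile_sim on hypothetical activity profiles: identical
# profiles score 1.0; anti-correlated ones floor at 0.0.
p1 = [1.0, 2.0, 3.0, 4.0]
print(profile_sim(p1, p1))                    # 1.0
print(profile_sim(p1, [4.0, 3.0, 2.0, 1.0]))  # 0.0 (negative r floored to 0)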
def compare_stability_matrices(ism_a, ism_b):
    """
    Calculate the similarity between two different stability maps.

    Parameters
    ----------
    ism_a : array_like
        A numpy stability matrix of shape (`V`, `V`), `V` voxels.
    ism_b : array_like
        A numpy stability matrix of shape (`V`, `V`), `V` voxels.

    Returns
    -------
    similarity : float
        1 minus the correlation distance between the two input matrices.
    """
    from sklearn.preprocessing import normalize
    from scipy.spatial.distance import correlation

    ism_a = normalize(ism_a, norm='l2')
    ism_b = normalize(ism_b, norm='l2')
    distance = correlation(ism_a.ravel(), ism_b.ravel())
    similarity = 1 - distance
    return similarity
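# Usage sketch with a hypothetical stability matrix: an ISM compared with
# itself gives similarity 1.0, regardless of the L2 row normalization.
import numpy as np

ism = np.random.default_rng(1).random((10, 10))
print(compare_stability_matrices(ism, ism))  # 1.0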
def resample_symbols(rx_frame, rx_p_ref, intp_n=10):
    """
    This function works around the imperfect-sampling-position problem.
    First, the received frame (rx_frame) is interpolated intp_n times;
    then, the best downsample group is found by comparing against the
    reference preamble (rx_p_ref); at last, the signal is downsampled and
    the resampled frame (rx_resampled) is returned.
    """
    rx_frame = np.concatenate([rx_frame, [rx_frame[-1]]])
    p_len = len(rx_p_ref)
    nsymbol = len(rx_frame)

    # pad the signal with more detail before down-sampling
    x_origin = np.arange(0, nsymbol)
    x_interp = np.arange(0, nsymbol - 1, nsymbol / (nsymbol * intp_n))
    f_interp = interpolate.interp1d(x_origin, rx_frame, 'cubic')
    rx_interp = f_interp(x_interp)
    rx_interp_left = np.concatenate([[rx_interp[0]] * intp_n,
                                     rx_interp[0:-1 * intp_n]])
    rx_candidate = np.concatenate([
        np.reshape(rx_interp_left, newshape=(intp_n, -1), order='F'),
        np.reshape(rx_interp, newshape=(intp_n, -1), order='F')
    ])

    # The following line sorts out the candidate sublist which has the
    # shortest distance from the reference signal. Except for correlation,
    # other "distances" also work: braycurtis, cosine, canberra, chebyshev
    dist = [correlation(candi, rx_p_ref) for candi in rx_candidate[:, 0:p_len]]
    rx_resampled = rx_candidate[np.argmin(dist)]
    return rx_resampled
def dist(a, b, t='euclidean'):
    if t == 'euclidean':
        return np.sqrt(np.sum((a - b) ** 2))  # or ssd.euclidean(a, b)
    elif t == 'correlation':
        return ssd.correlation(a, b)
    elif t == 'dtw':
        return dtw.dtw(a, b, distance_only=True).distance
def get_distance_vectors(vector1, vector2):
    # note: despite the name used in the original, this first value is the
    # city-block (Manhattan) distance, not the Mahalanobis distance
    cityblock_distance = distance.cityblock(vector1, vector2)
    cosine_distance = distance.cosine(vector1, vector2)
    correlation_distance = distance.correlation(vector1, vector2)
    return cityblock_distance, cosine_distance, correlation_distance
def kmeansClassify(A, means, distType="euclidean"):
    codesErrors = []
    for i in range(A.shape[0]):
        d = [0, float("inf")]
        # check the row against all means, and store the index of the
        # closest mean together with the distance to it
        for j in range(means.shape[0]):
            if distType == "euclidean":
                newd = dist.euclidean(A[i, :], means[j, :])
            elif distType == "cosine":
                newd = dist.cosine(A[i, :], means[j, :])
            elif distType == "canberra":
                newd = dist.canberra(A[i, :], means[j, :])
            elif distType == "manhattan":
                newd = dist.cityblock(A[i, :], means[j, :])
            elif distType == "correlation":
                newd = dist.correlation(A[i, :], means[j, :])
            elif distType == "hamming":
                newd = dist.hamming(A[i, :], means[j, :])
            if newd < d[1]:
                d = [j, newd]
        codesErrors.append(d)
    # returns the codes and errors
    return (np.matrix(codesErrors)[:, 0], np.matrix(codesErrors)[:, 1])
def computeDistance(X, Y, method):
    if 'cosine' in method:
        dist = spdistance.cosine(X, Y)
    elif 'dot' in method:
        dist = 1.0 - X.dot(Y)
    elif 'chi2' in method:
        dist = chiSquare2(X, Y)
    elif 'chi3' in method:
        dist = chiSquare3(X, Y)
    elif 'chi' in method:
        dist = chiSquare(X, Y)
    elif 'euclidean' in method:
        dist = cv2.norm(X, Y)
    elif 'canberra' in method:
        dist = spdistance.canberra(X, Y)
    elif 'correl' in method:
        dist = spdistance.correlation(X, Y)
    else:
        # does that work?
        dist = cv2.compareHist(X, Y, method)
        # histogram correlation / intersection are similarities, flip to a distance
        if hasattr(cv2, 'cv') and 'cv2.cv.CV_COMP_CORREL' in method:
            dist = 1 - dist
        elif hasattr(cv2, 'HISTCMP_CORREL') and 'cv2.HISTCMP_CORREL' in method:
            dist = 1 - dist
        elif hasattr(cv2, 'cv') and 'cv2.cv.CV_COMP_INTERSECT' in method:
            dist = 1 - dist
        elif hasattr(cv2, 'HISTCMP_INTERSECT') and 'cv2.HISTCMP_INTERSECT' in method:
            dist = 1 - dist
    return dist
def kMeans(self):
    for numClusters in range(self.minClusters, self.maxClusters + 1):
        self.gain[numClusters] = {}
        self.gain[numClusters]["avg"] = []
        for rep in range(n):
            clustId = numClusters
            print("Running on %s clusters, rep %s" % (numClusters, rep + 1))
            self.gain[clustId]["labels"] = list(
                KMeans(numClusters).fit(np.array(self.data)).labels_)
            centroids = [[0 for x in range(len(self.data[0]))]
                         for y in range(numClusters)]
            print("\tFinding Centroids")
            for index, pt in enumerate(self.data):
                cluster = self.gain[clustId]["labels"][index]
                prevCenter = centroids[cluster]
                centroids[cluster] = self._solve_centroid(pt, prevCenter)
            self.gain[clustId]["cosine"] = 0
            self.gain[clustId]["cheby"] = 0
            self.gain[clustId]["euclid"] = 0
            self.gain[clustId]["jaccard"] = 0
            for index, pt in enumerate(self.data):
                cluster = self.gain[clustId]["labels"][index]
                centroid = centroids[cluster]
                self.gain[clustId]["cosine"] += distance.cosine(centroid, pt) / len(self.data)
                self.gain[clustId]["cheby"] += distance.chebyshev(centroid, pt) / len(self.data)
                # note: despite the key name, this accumulates the
                # correlation distance, not the Jaccard distance
                self.gain[clustId]["jaccard"] += distance.correlation(centroid, pt) / len(self.data)
            marginGain = self.bestMarginalGain(clustId, rep, centroids)
            if marginGain[0] is False:
                return marginGain[1], self.gain[marginGain[0]]["labels"]
    print("Max clusters is best marginal gain, "
          "consider rerunning with higher max")
    return self.maxClusters, self.gain[clustId]["labels"]
def calc_distance(v1, v2):
    # sum of the correlation distances over all vector pairs
    cor = []
    for vector in range(len(v1)):
        cor.append(correlation(v1[vector], v2[vector]))
    return sum(cor)
def corr_dspec_shape(data, proto, sigma=2.0):
    # validating feature sizes
    if data.shape[1] != proto.shape[1]:
        raise Exception('Both "data" and "prototypes" must have the same feature sizes.')

    # getting samples and prototypes count
    sc = data.shape[0]
    pc = proto.shape[0]

    # resulting dissimilarity representation
    d = np.zeros((sc, pc))

    # derivative filter for both data and prototypes
    data2 = derfilter(data, sigma)
    proto2 = derfilter(proto, sigma)

    # normalizing each row by its maximum value
    data2 = np.apply_along_axis(lambda row: row / row.max(), 1, data2)
    proto2 = np.apply_along_axis(lambda row: row / row.max(), 1, proto2)

    # TODO: Optimization here!: 1-list comprehension, 2-out=np.vstack(list_comprehension)
    for i in range(pc):
        t = np.apply_along_axis(lambda row: correlation(row, proto2[i, :]), 1, data2)
        d[:, i] = t

    # the dissimilarity representation
    return d
def corr_shape_measure(x, y, sigma=2.0):
    """Computes the shape dissimilarity value.

    Args:
        x (list): The first vector.
        y (list): The second vector.
        sigma (float): The smoothing parameter

    Returns:
        float: The shape dissimilarity value between vectors x and y.
    """
    # getting the length of the vectors
    x_length = len(x)
    y_length = len(y)

    # validating parameters
    if x_length != y_length:
        raise Exception('Vectors with different sizes')

    # TODO: Here it is assumed that x and y are lists. Analyze the
    # possibility for them to be tuples or numpy arrays.
    # converting x and y to numpy arrays
    x_arr = np.array(x, np.float32)
    y_arr = np.array(y, np.float32)

    # applying a first gaussian derivative filter to both
    x_gauss = scipy_gauss1d(x_arr, sigma, order=1)
    y_gauss = scipy_gauss1d(y_arr, sigma, order=1)

    # computing the shape dissimilarity
    return correlation(x_gauss, y_gauss)
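# Sketch of why the derivative-plus-correlation measure above is shape-based,
# on hypothetical curves: an affinely transformed copy (scaled and offset)
# has dissimilarity ~0, since Pearson correlation ignores scale and offset
# and the derivative removes the constant term entirely.
import numpy as np

t = np.linspace(0.0, 1.0, 100)
curve = np.exp(-((t - 0.5) ** 2) / 0.01)
scaled = 3.0 * curve + 7.0
print(corr_shape_measure(curve.tolist(), scaled.tolist()))  # ~0.0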
def similarity(a, b):
    # Get common elements and remove 0 values (no review)
    commons_a = []
    commons_b = []
    for j in range(len(a)):
        if a[j] != 0 and b[j] != 0:
            commons_a.append(a[j])
            commons_b.append(b[j])
    commons_count = len(commons_a)

    # If there are no common elements, return zero; otherwise
    # compute the coefficient
    if commons_count == 0:
        return 0
    pearson_correlation = correlation(commons_a, commons_b)

    # If the divisor is zero
    if math.isnan(pearson_correlation):
        return 0
    return round(1 - pearson_correlation, 2)
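# Usage sketch with hypothetical rating vectors (0 = no review): only the
# positions rated in both vectors enter the Pearson computation.
a = [5, 0, 3, 4, 1]
b = [4, 2, 2, 5, 1]
print(similarity(a, b))  # computed over positions 0, 2, 3, 4 only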
def search(image):
    # Get feature maps from image
    feature_maps = Wear.get_feature_maps(image)

    # Searching in database
    wears = Database.connect().wears.find({})

    # Set predictions to image
    predictions = []
    for wear in wears:
        # Get feature maps from wear
        wear_fm = pickle.loads(str(wear['feature_maps']))['_feature_maps']

        # Calculating distances
        # Euclidean distance (the only one used in the result list below)
        euclidean_distance = np.sqrt(np.sum((feature_maps - wear_fm) ** 2.))
        # Cosine distance (computed but currently unused)
        cosine_distance = cosine(feature_maps, wear_fm)
        # Correlation distance (computed but currently unused)
        correlation_distance = correlation(feature_maps, wear_fm)

        predictions.append([str(wear['image']), str(wear['link']),
                            euclidean_distance])
    return predictions
def comparefiles(pypath, cudaresult, writeresult, dtype):
    # Takes 2 paths for the files to compare and a path to write the result.
    f = open(pypath, 'r')
    ff = open(cudaresult, 'r')
    if f.mode == 'r':
        data = np.loadtxt(f, dtype=dtype, converters={
            0: lambda s: complex(s.decode().replace(
                '+-', '-').replace('(', '').replace(')', ''))
        })
    if ff.mode == 'r':
        data2 = np.loadtxt(
            ff, dtype=dtype,
            converters={0: lambda s: complex(s.decode().replace('+-', '-'))})

    # WIP: other distance measurements might be more meaningful, this is a first try.
    euclideandst = distance.euclidean(data, data2)
    manhattendst = distance.cityblock(data, data2)
    correlationdst = distance.correlation(data, data2)

    # Print the output on cmd.
    print("Euclidean distance between the scripts is:")
    print(euclideandst)
    print("Manhattan distance between the scripts is:")
    print(manhattendst)
    print("Correlation distance between the scripts is:")
    print(correlationdst)

    # Write the output to custom path.
    result = open(writeresult, "a")
    result.write("Euclidean distance:" + str(euclideandst) +
                 "\nManhattan distance:" + str(manhattendst) +
                 "\nCorrelation distance:" + str(correlationdst))
    result.close()
def evaluate_centrality(object, clust):
    sumcentrality = 0
    for elem in clust:
        dist = scidist.correlation(data_matrix[elem], data_matrix[object])
        # Gaussian kernel on the correlation distance
        sumcentrality += math.e ** (-10 * (dist ** 2))
    return sumcentrality / float(len(clust))
def correlate(IM1, IM2):
    IM1 = np.asarray(IM1).ravel()
    IM2 = np.asarray(IM2).ravel()
    # drop the positions that are background in either image; the comparisons
    # must happen on arrays (on lists, `IM1 == 0` is a single False)
    indexes = np.unique(np.concatenate((np.where(IM1 == 0),
                                        np.where(IM2 == 0.0654)), 1))
    IM1 = np.delete(IM1, indexes)
    IM2 = np.delete(IM2, indexes)
    return correlation(IM1, IM2)
def correlation_MDS(data):
    seed = np.random.RandomState(seed=3)

    # pairwise correlation-distance matrix
    similarities = [[0 for x in range(len(data))] for x in range(len(data))]
    for i in range(len(data)):
        for j in range(len(data)):
            similarities[i][j] = correlation(data[i], data[j])

    mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
                       random_state=seed, dissimilarity="precomputed", n_jobs=1)
    pos = mds.fit_transform(similarities)
    return pos
def GetCorr(XAxis, YAxis, ZAxis):
    X = array(XAxis)
    Y = array(YAxis)
    Z = array(ZAxis)

    # Normalize (correlation distance is scale- and offset-invariant,
    # so this normalization does not change the result)
    X_A = (X - mean(X)) / (std(X) * len(X))
    Y_A = (Y - mean(Y)) / std(Y)
    CorrA = 1.0 - correlation(X_A, Y_A)

    Y_B = (Y - mean(Y)) / (std(Y) * len(Y))
    Z_B = (Z - mean(Z)) / std(Z)
    CorrB = 1.0 - correlation(Y_B, Z_B)

    Z_C = (Z - mean(Z)) / (std(Z) * len(Z))
    X_C = (X - mean(X)) / std(X)
    CorrC = 1.0 - correlation(Z_C, X_C)

    return CorrA, CorrB, CorrC
def compute_similarity(self, arr1, arr2):
    if self.simfcn == "cosine":
        return self.d_to_sim(cosine(arr1, arr2))
    elif self.simfcn == "pearson":
        return self.d_to_sim(correlation(arr1, arr2))
    elif self.simfcn == "hamming":
        return 1 - hamming(arr1, arr2)
    elif self.simfcn == "jaccard":
        return 1 - jaccard(arr1, arr2)
    else:
        print("Similarity function not yet supported")
        exit()
def seqcor(m1, m2):
    l1 = len(m1)
    l2 = len(m2)
    l = max(l1, l2)

    # Create a random sequence
    nucs = []
    L = 10 ** 4
    for i in range(L):
        nucs.append(random.choice(['A', 'C', 'T', 'G']))
    random_seq = "".join(nucs)

    # Scan the random sequence with both motifs
    result1 = pwmscan(random_seq.upper(), m1.pwm, m1.pwm_min_score(),
                      len(random_seq), False, True)
    result2 = pwmscan(random_seq.upper(), m2.pwm, m2.pwm_min_score(),
                      len(random_seq), False, True)

    # Return the maximum correlation over all offsets
    c = []
    for i in range(l1):
        c.append(1 - distance.correlation(result1[:L - l - i], result2[i:L - l]))
    for i in range(l2):
        c.append(1 - distance.correlation(result1[i:L - l], result2[:L - l - i]))
    return max(c)
def get_direction_score(direction_map, coord, matrix):
    total = 0
    x, y = matrix.shape[:2]
    a, b = coord
    point = matrix[a, b, :]
    n = 0
    for i, k in enumerate(direction_map):
        if k[0] == 0 and k[1] == 0:
            continue
        if 0 <= a + k[0] < x and 0 <= b + k[1] < y:
            neighbour = matrix[a + k[0], b + k[1], :]
            corr = correlation(point, neighbour)
            # It has a range of 0..2 and should be 0..1, so we divide by 2
            # See http://stackoverflow.com/questions/35988933/scipy-distance-correlation-is-higher-than-1
            total += corr / 2
            n += 1
    return 1 - (total / n)
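# Sketch of the range normalization used above: correlation distance spans
# 0..2, hitting 2 for perfectly anti-correlated vectors, so dividing by 2
# maps it onto 0..1.
import numpy as np
from scipy.spatial.distance import correlation

u = np.array([1.0, 2.0, 3.0])
print(correlation(u, u))       # 0.0 (identical)
print(correlation(u, -u))      # 2.0 (anti-correlated)
print(correlation(u, -u) / 2)  # 1.0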
def getSimilarity(baseDict, tidDict):
    # generates vectors for two tracks to be used in the similarity function
    # and returns the correlation distance between them; the longer dict
    # defines the key set, and missing keys count as 0
    baseVector = []
    tidVector = []
    if len(baseDict) >= len(tidDict):
        for (k, v) in baseDict.items():
            baseVector.append(v)
            if k in tidDict:
                tidVector.append(tidDict[k])
            else:
                tidVector.append(0)
    else:
        for (k, v) in tidDict.items():
            tidVector.append(v)
            if k in baseDict:
                baseVector.append(baseDict[k])
            else:
                baseVector.append(0)
    return dis.correlation(baseVector, tidVector)
def main(): print "# KNN Classifier" parser = ld.parse_arguments() stopwords = None if parser.stopwords_path: stopwords = ld.load_stopwords(parser.stopwords_path) # priting args print '\t-k = ' + str(parser.k) print '\t-d = ' + parser.distance # loading the necessary data (vocabulary, neigh_classes) = ld.load_train(parser.train_path, stopwords) print "# Tamanho do vocabulário:", len(vocabulary) # transforming each item to a v-dimensional space (train, test) = space.transform(vocabulary, parser.train_path, parser.test_path) # output file out_path = parser.distance + '_' + str(parser.k) out_path += '.txt' out_file = open(out_path, 'w') # knn classification print "# Classifying", len(train) * parser.percentage for item in test: dist_heap = [] # calculates the distance to every point in the training set for i in xrange(int(len(train) * parser.percentage)): point = train[i] distance = 0.0 if parser.distance == 'cosine': distance = spd.cosine(item, point) elif parser.distance == 'jaccard': distance = spd.jaccard(item, point) elif parser.distance == 'euclidean': distance = spd.euclidean(item, point) elif parser.distance == 'hamming': distance = spd.hamming(item, point) elif parser.distance == 'correlation': distance = spd.correlation(item, point) elif parser.distance == 'manhattan': distance = spd.cityblock(item, point) else: print >> stderr, "ERRO! - Distância informada inválida." exit() tup = (distance, i) heapq.heappush(dist_heap, tup) # return the highest k similar points top_k = heapq.nsmallest(parser.k, dist_heap) # classifing classification = np.zeros(2) for (_, idi) in top_k: classe = neigh_classes[idi] classification[int(classe)] += 1 # DEBUG print classification, # outputing classification if(classification[0] >= classification[1]): print >> out_file, '0' print '0' else: print >> out_file, '1' print '1' print print "# Resultados salvos no arquivo: " + out_path out_file.close() result.result("../data/imdb_test", out_path)
def correlation(itemset1, itemset2):
    return dist.correlation(itemset1, itemset2)
def main(): print "# KNN Classifier" parser = ld.parse_arguments() # priting args print '\t-k = ' + str(parser.k) print '\t-d = ' + parser.distance stopwords = None if parser.stopwords_path: stopwords = ld.load_stopwords(parser.stopwords_path) voc = load_vocabulary(parser.train_path, stopwords) answers = load_answers(parser.train_path) train = transform(voc, parser.train_path) test = transform(voc, parser.test_path) # output file out_path = '../results/' + parser.distance + '_' + str(parser.k) out_path += '.txt' out_file = open(out_path, 'w') for point in test: neighbors = [] for i in xrange(len(train)): neigh = train[i] distance = 0.0 if parser.distance == 'cosine': distance = spd.cosine(neigh, point) elif parser.distance == 'jaccard': distance = spd.jaccard(neigh, point) elif parser.distance == 'euclidean': distance = spd.euclidean(neigh, point) elif parser.distance == 'dice': distance = spd.dice(neigh, point) elif parser.distance == 'correlation': distance = spd.correlation(neigh, point) elif parser.distance == 'manhattan': distance = spd.cityblock(neigh, point) else: print >> stderr, "ERRO! - Distância informada inválida." exit() tup = (distance, i) heapq.heappush(neighbors, tup) # return the highest k similar points top_k = heapq.nsmallest(parser.k, neighbors) # classifing classification = np.zeros(2) for (_, idi) in top_k: classe = answers[idi] classification[int(classe)] += 1 # outputing classification if(classification[0] >= classification[1]): print >> out_file, '0' print '0' else: print >> out_file, '1' print '1' # outputing the results' print print "# Resultados salvos no arquivo: " + out_path out_file.close() result.result("../data/imdb_test", out_path)
def proj3(pos, x):
    # pos: position, x: new data point
    if pos == 'C':
        # XTrain: training data for NB, yTest: training labels for NB
        XTrain = np.loadtxt('Ctrain.txt')
        XTest = np.loadtxt('C.txt')
        yTest = np.loadtxt('Clabs.txt')
    elif pos == 'PF':
        XTrain = np.loadtxt('PFtrain.txt')
        XTest = np.loadtxt('PF.txt')
        yTest = np.loadtxt('PFlabs.txt')
    elif pos == 'PG':
        XTrain = np.loadtxt('PGTrain.txt')
        XTest = np.loadtxt('PG.txt')
        yTest = np.loadtxt('PGlabs.txt')
    elif pos == 'SF':
        XTrain = np.loadtxt('SFTrain.txt')
        XTest = np.loadtxt('SF.txt')
        yTest = np.loadtxt('SFlabs.txt')
    elif pos == 'SG':
        XTrain = np.loadtxt('SGTrain.txt')
        XTest = np.loadtxt('SG.txt')
        yTest = np.loadtxt('SGlabs.txt')
    else:
        print("Please re-input the position.")

    D = XTrain.shape[1]  # number of features

    NBte = GaussianNB()
    NBte.fit(XTest, yTest)
    yTrain = NBte.predict(XTrain)  # predicting the classes of XTrain's rows
    yte = NBte.predict(x)          # predicting the class of x (w.r.t. testing data)

    # create training data from the players (2009-2011) in the same class as x
    tmpTrain = np.zeros(D)  # training data for Ranking SVM
    TrIndex = []            # indices into XTrain
    for i in range(len(yTrain)):
        if yTrain[i] == yte:  # the same class as the new data point
            tmpTrain = np.vstack((tmpTrain, XTrain[i]))
            TrIndex = np.append(TrIndex, i)
    tmpTrain = np.delete(tmpTrain, 0, 0)  # delete the initializing row

    # calculate correlation distances between rows of tmpTrain and x
    TrCorrD = np.zeros(np.shape(TrIndex))
    for i in range(len(TrCorrD)):
        TrCorrD[i] = spd.correlation(tmpTrain[i], x)
    TrRank = np.argsort(TrCorrD)
    # select the top 10 relevant training points
    noTrPts = len(TrIndex) if len(TrIndex) < 10 else 10
    vecTrain = tmpTrain[TrRank[:noTrPts]]

    # create training feature vectors
    noFt = 2  # number of features
    ftTrain = np.zeros((noTrPts, noFt))
    for i in range(noTrPts):
        ftTrain[i] = np.array([spd.euclidean(vecTrain[i], x),
                               spd.cosine(vecTrain[i], x)])

    # create training matrix and labels for SVM from vecTrain and TrRank
    SVMTrain = np.zeros((noTrPts * (noTrPts - 1), noFt))
    SVMLabel = np.zeros(np.shape(SVMTrain)[0]) - 1
    for i in range(noTrPts):
        for j in range(noTrPts):
            if i > j:
                SVMTrain[i * (noTrPts - 1) + j] = ftTrain[i] - ftTrain[j]
                # smaller rank => closer distance
                SVMLabel[i * (noTrPts - 1) + j] = 1 if TrRank[i] < TrRank[j] else 0
            elif i < j:
                SVMTrain[i * (noTrPts - 1) + j - 1] = ftTrain[i] - ftTrain[j]
                SVMLabel[i * (noTrPts - 1) + j - 1] = 1 if TrRank[i] < TrRank[j] else 0
            # if i == j, skip

    # create testing data from the players (2011-2015) in the same class as x
    tmpTest = np.zeros(D)  # data of the same class as x in the testing data
    TeIndex = []           # indices into the testing data
    for i in range(len(yTest)):
        if yTest[i] == yte:
            tmpTest = np.vstack((tmpTest, XTest[i]))
            TeIndex = np.append(TeIndex, i)
    tmpTest = np.delete(tmpTest, 0, 0)  # delete the initializing row

    # calculate correlation distances between testing data and x
    TeCorrD = np.zeros(np.shape(TeIndex))
    for i in range(len(TeCorrD)):
        TeCorrD[i] = spd.correlation(tmpTest[i], x)
    TeRank = np.argsort(TeCorrD)
    noTePts = noTrPts  # select the top 10 relevant testing points
    vecTest = tmpTest[TeRank[:noTePts]]

    # calculate the ideal DCG for NDCG
    TeGrade = np.arange(noTePts, 0, -1)
    TeGains = 2 ** TeGrade - 1
    TeDisct = 1 / np.log2(np.arange(2, 2 + noTePts))
    TeDcg = np.zeros(noTePts)
    for i in range(noTePts):
        TeDcg[i] = TeDcg[i - 1] + TeGains[i] * TeDisct[i]

    # create testing feature vectors
    ftTest = np.zeros((noTePts, noFt))
    for i in range(noTePts):
        ftTest[i] = np.array([spd.euclidean(vecTest[i], x),
                              spd.cosine(vecTest[i], x)])

    # create testing matrix and labels for SVM from vecTest and TeRank
    SVMTest = np.zeros((noTePts * (noTePts - 1), noFt))
    TeLabs = np.zeros(np.shape(SVMTest)[0]) - 1  # reference labels for comparison
    for i in range(noTePts):
        for j in range(noTePts):
            if i > j:
                SVMTest[i * (noTePts - 1) + j] = ftTest[i] - ftTest[j]
                TeLabs[i * (noTePts - 1) + j] = 1 if TeRank[i] < TeRank[j] else 0
            elif i < j:
                SVMTest[i * (noTePts - 1) + j - 1] = ftTest[i] - ftTest[j]
                TeLabs[i * (noTePts - 1) + j - 1] = 1 if TeRank[i] < TeRank[j] else 0

    # train the ranking SVM
    clf = SVC(C=0.01, kernel='linear')
    clf.fit(SVMTrain, SVMLabel)
    pred_labels = clf.predict(SVMTest)
    visTest = np.reshape(pred_labels, (noTePts, noTePts - 1))  # per-point wins
    ids = (-np.sum(visTest, axis=1)).argsort()[:noTePts]       # descending order
    ReIndex = TeIndex[TeRank[:noTePts]][ids]  # ranked testing indices
    MatchPlayerList = np.int_(ReIndex)
    MatchPlayer = list(MatchPlayerList)[0]
    return MatchPlayer

    # NOTE: the NDCG evaluation below is unreachable as written,
    # since it follows the return above
    ReGrade = np.zeros(noTePts)
    for i in range(noTePts):
        for j in range(noTePts):
            if ReIndex[i] == TeIndex[TeRank[:noTePts]][j]:
                ReGrade[i] = TeGrade[j]
    ReGains = 2 ** ReGrade - 1
    ReDisct = TeDisct
    ReDcg = np.zeros(noTePts)
    for i in range(noTePts):
        ReDcg[i] = ReDcg[i - 1] + ReGains[i] * ReDisct[i]
    ReNdcg = ReDcg / TeDcg
def correlation_similarity(x, y):
    c = distance.correlation(x, y)
    if np.isnan(c):
        return 0
    else:
        return 1 - c
def plotter(Value):
    X_pos = []
    X_neg = []
    X_obj = []
    fileName = TRAINING_FILES_PATTERN + Value
    for line in fileinput.input([fileName]):
        split = line.split("\t")
        if split[0] == POSITIVE_POLARITY_FOR_SCORER:
            X_pos.append(float(split[1]))
        elif split[0] == NEGATIVE_POLARITY_FOR_SCORER:
            X_neg.append(float(split[1]))
        else:
            X_obj.append(float(split[1]))
    return X_pos, X_neg, X_obj


max_pos, max_neg, max_obj = plotter(maxValue)
min_pos, min_neg, min_obj = plotter(minValue)

# combined score lists (named to avoid shadowing the max/min builtins)
max_all = max_pos + max_neg + max_obj
min_all = min_pos + min_neg + min_obj
print(max_all)
print(min_all)

print("Correlation is: " + str(correlation(max_all, min_all)))
plt.plot(max_all, min_all, marker='o', ls='')
plt.xlabel("Value of alpha as 10^0")
plt.ylabel("Value of alpha as 10^-7")
plt.show()
def plot_blockades(blockades_file, model_files, cluster_size, show_text):
    """
    Pretty plotting
    """
    WINDOW = 4

    blockades = read_mat(blockades_file)
    clusters = sp.preprocess_blockades(blockades, cluster_size=cluster_size,
                                       min_dwell=0.5, max_dwell=20)
    peptide = clusters[0].blockades[0].peptide

    models = []
    for model_file in model_files:
        models.append(load_model(model_file))

    for cluster in clusters:
        signal_length = len(cluster.consensus)
        x_axis = np.linspace(0, len(peptide) + 1, signal_length)

        matplotlib.rcParams.update({"font.size": 16})
        fig = plt.subplot()
        fig.spines["right"].set_visible(False)
        fig.spines["top"].set_visible(False)
        fig.get_xaxis().tick_bottom()
        fig.get_yaxis().tick_left()
        fig.set_xlim(0, len(peptide) + 1)
        fig.set_xlabel("Putative AA position")
        fig.set_ylabel("Normalized signal")

        fig.plot(x_axis, cluster.consensus, label="Empirical signal",
                 linewidth=1.5)

        # interpolate each model signal onto the empirical signal's grid
        for model in models:
            model_signal = model.peptide_signal(peptide)
            model_grid = [i * signal_length / (len(model_signal) - 1)
                          for i in range(len(model_signal))]
            interp_fun = interp1d(model_grid, model_signal, kind="linear")
            model_interp = interp_fun(range(signal_length))

            corr = 1 - distance.correlation(cluster.consensus, model_interp)
            print("{0} correlation: {1:5.2f}\t".format(model.name, corr),
                  file=sys.stderr)
            fig.plot(x_axis, model_interp, label=model.name, linewidth=2)

        legend = fig.legend(loc="lower left", frameon=False)
        for label in legend.get_lines():
            label.set_linewidth(2)
        for label in legend.get_texts():
            label.set_fontsize(16)

        if show_text:
            # adding AA text labels
            event_mean = np.mean(cluster.consensus)
            acids_pos = _get_aa_positions(peptide, WINDOW, x_axis[-1])
            for i, aa in enumerate(peptide):
                fig.text(acids_pos[i], event_mean - 2, aa, fontsize=16)

        plt.show()
DEVELOPMENT_KEY = '../../dev.key'
SCORER_SCRIPT = '../../scorer.py'
WEIGHTS_FILE = "averaged_weight_vector_akj"
LOG_POSITIVE_FILE = "versus_file_bayes"
DISTINCT_TOKENS = 11083

for line in fileinput.input([GENERATED_FILES_DIRECTORY + WEIGHTS_FILE]):
    if len(line) == 0:
        break
    lister = line.split(" ")
    positive_weights = []
    for weight in lister[:DISTINCT_TOKENS]:
        append_weight = float(weight)
        positive_weights.append(append_weight)

# Plot the positive weights against the values in versus_bayes
log_weights = []
actual_positives = []
for line in fileinput.input([GENERATED_FILES_DIRECTORY + LOG_POSITIVE_FILE]):
    lister = line.split("\t")
    log_weights.append(float(lister[1]))
    actual_positives.append(positive_weights[int(lister[2])])

plt.plot(log_weights, actual_positives, marker='o', ls='')
plt.xlabel("Log weights of positive tokens (Naive Bayes Classifier - ALPHA = 10^-3)")
plt.ylabel("Weights of the positive tokens (Perceptron Classifier)")
print("Correlation is: " + str(correlation(log_weights, actual_positives)))
#plt.plot(log_weights, 'rs')
plt.show()
def correlate(IM1, IM2):
    # correlation distance between the flattened images
    score = correlation(IM1.ravel(), IM2.ravel())
    return score
def evaluate(self, dataset, vectorizer, high_dim_kron=False):
    pairs = list(dataset.dependency_graphs_pairs())

    # TODO: Refactor to mimic scikit-learn pipeline.
    sent_vectors = (
        (
            g1,
            vectorizer.vectorize(g1),
            g2,
            vectorizer.vectorize(g2),
            score,
        ) + tuple(extra)
        for g1, g2, score, *extra in pairs
    )

    if not high_dim_kron:
        sent_vectors = (
            (g1, v1.toarray().flatten(), g2, v2.toarray().flatten(), score) + tuple(extra)
            for g1, v1, g2, v2, score, *extra in sent_vectors
        )

        result_values = (
            (
                g1,
                g2,
                1 / (1 + distance.euclidean(v1, v2)),
                1 - distance.cosine(v1, v2),
                1 - distance.correlation(v1, v2),
                v1.dot(v2.T),
                score,
            ) + tuple(extra)
            for g1, v1, g2, v2, score, *extra in sent_vectors
        )

        result_columns = (
            'euclidean',
            'cos',
            'correlation',
            'inner_product',
        )
    else:
        result_values = (
            (
                g1,
                g2,
                (v1 * s1).dot(v2 * s2).T * (v1 * o1).dot((v2 * o2).T),
                score,
            ) + tuple(extra)
            for g1, (s1, v1, o1), g2, (s2, v2, o2), score, *extra in sent_vectors
        )

        result_columns = (
            'inner_product',
        )

    result = pd.DataFrame.from_records(
        [
            (tree(g1), tree(g2)) + tuple(rest)
            for g1, g2, *rest in self.progressify(
                result_values,
                description='Similarity',
                max=len(pairs),
            )
        ],
        columns=(
            ('unit1', 'unit2', )
            + result_columns
            + ('score', )
            + getattr(dataset, 'extra_fields', tuple())
        )
    )

    if not result.notnull().all().all():
        logger.warning('Null values in similarity scores.')

    for column in result_columns:
        rho, p = stats.spearmanr(result[[column, 'score']])
        print(
            'Spearman correlation {info}, {column}: '
            '{style.BOLD}rho={rho:.3f}{style.RESET}, p={p:.5f}, support={support}'
            .format(
                rho=rho,
                p=p,
                style=style,
                info=vectorizer.info(),
                support=len(result),
                column=column,
            )
        )

    return result
#print "question cosine similairity-->",cosine_similarity(QuestionTVectorArray[0:1],QuestionTVectorArray) #print "answer cosine similarity-->",cosine_similarity(AnswerTVectorArray[0:1],AnswerTVectorArray) Qcosines=cosine_similarity(QuestionTVectorArray[0:1],QuestionTVectorArray) Acosines=cosine_similarity(AnswerTVectorArray[0:1],AnswerTVectorArray) Qbray=[dist.braycurtis(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray] Abray=[dist.braycurtis(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray] Qcanberra=[dist.canberra(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray] Acanberra=[dist.canberra(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray] Qhamming=[dist.hamming(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray] Ahamming=[dist.hamming(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray] Qcorrelation=[dist.correlation(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray] Acorrelation=[dist.correlation(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray] Qcityblock=[dist.cityblock(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray] Acityblock=[dist.cityblock(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray] Qdice=[dist.dice(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray] Adice=[dist.dice(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray] Qyule=[dist.yule(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray] Ayule=[dist.yule(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray] #C_Q=np.histogram2d(QuestionTVectorArray[1],QuestionTVectorArray[1])[0] #print "question mutual info-->",mutual_info_score(None,None,contigency=C_Q)#QuestionTVectorArray[0:1],QuestionTVectorArray) #QuestionVectorArray=Qvectorizer.fit_transform(all_questions).toarray()
def correlation(pair):
    # takes an (x, y) pair of vectors
    x, y = pair
    # return list(pearsonr(x, y))
    return distance.correlation(x, y)
def wvCorr(a):
    # correlation distance for each (vector, vector) pair in a
    return [distance.correlation(x[0], x[1]) for x in a]
def pearson_corr(ind1, ind2, matrix, _):
    # TODO: fix mean
    v1 = matrix[ind1]
    v2 = matrix[ind2]
    return 1 - distance.correlation(v1, v2)
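# Sketch of the identity relied on throughout these snippets:
# scipy.spatial.distance.correlation(u, v) equals 1 - Pearson r, so
# 1 - distance recovers the correlation coefficient itself.
import numpy as np
from scipy.spatial import distance
from scipy.stats import pearsonr

u = np.array([1.0, 2.0, 4.0, 3.0])
v = np.array([2.0, 1.0, 5.0, 4.0])
r, _ = pearsonr(u, v)
assert np.isclose(1 - distance.correlation(u, v), r)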