def __calc_distances__(self, v1s, v2s, is_sparse=True): if is_sparse: dcosine = np.array([cosine(x.toarray(), y.toarray()) for (x, y) in zip(v1s, v2s)]).reshape((-1,1)) dcityblock = np.array([cityblock(x.toarray(), y.toarray()) for (x, y) in zip(v1s, v2s)]).reshape((-1,1)) dcanberra = np.array([canberra(x.toarray(), y.toarray()) for (x, y) in zip(v1s, v2s)]).reshape((-1,1)) deuclidean = np.array([euclidean(x.toarray(), y.toarray()) for (x, y) in zip(v1s, v2s)]).reshape((-1,1)) dminkowski = np.array([minkowski(x.toarray(), y.toarray(), 3) for (x, y) in zip(v1s, v2s)]).reshape((-1,1)) dbraycurtis = np.array([braycurtis(x.toarray(), y.toarray()) for (x, y) in zip(v1s, v2s)]).reshape((-1,1)) dskew_q1 = [skew(x.toarray().ravel()) for x in v1s] dskew_q2 = [skew(x.toarray().ravel()) for x in v2s] dkur_q1 = [kurtosis(x.toarray().ravel()) for x in v1s] dkur_q2 = [kurtosis(x.toarray().ravel()) for x in v2s] dskew_diff = np.abs(np.array(dskew_q1) - np.array(dskew_q2)).reshape((-1,1)) dkur_diff = np.abs(np.array(dkur_q1) - np.array(dkur_q2)).reshape((-1,1)) else: dcosine = np.array([cosine(x, y) for (x, y) in zip(v1s, v2s)]).reshape((-1,1)) dcityblock = np.array([cityblock(x, y) for (x, y) in zip(v1s, v2s)]).reshape((-1,1)) dcanberra = np.array([canberra(x, y) for (x, y) in zip(v1s, v2s)]).reshape((-1,1)) deuclidean = np.array([euclidean(x, y) for (x, y) in zip(v1s, v2s)]).reshape((-1,1)) dminkowski = np.array([minkowski(x, y, 3) for (x, y) in zip(v1s, v2s)]).reshape((-1,1)) dbraycurtis = np.array([braycurtis(x, y) for (x, y) in zip(v1s, v2s)]).reshape((-1,1)) dskew_q1 = [skew(x) for x in v1s] dskew_q2 = [skew(x) for x in v2s] dkur_q1 = [kurtosis(x) for x in v1s] dkur_q2 = [kurtosis(x) for x in v2s] dskew_diff = np.abs(np.array(dskew_q1) - np.array(dskew_q2)).reshape((-1,1)) dkur_diff = np.abs(np.array(dkur_q1) - np.array(dkur_q2)).reshape((-1,1)) return np.hstack((dcosine,dcityblock,dcanberra,deuclidean,dminkowski,dbraycurtis,dskew_diff,dkur_diff))
def compute_Correlation_matrix(patients_tri, triclusters):
    """
    Build a (patient x tricluster) matrix of mean Canberra similarities.

    For each patient bicluster and each tricluster, compares per-sample
    slices of the patient against the tricluster's slices and averages
    1 - canberra over all comparisons.

    NOTE(review): the second inner loop (over time points) computes its
    slices but its `corr_list.append` is commented out, so it only spends
    time on `getSlice` calls without affecting the result — confirm whether
    it should be re-enabled or removed.
    """
    final_matrix = list()
    print(len(patients_tri))  # debug: number of patient triclusters
    for bic_p in patients_tri:
        line = list()
        for tric in triclusters:
            corr_list = list()
            for tric_p in tric.getPatients():
                tric_tps = tric.getTimes()
                tric_fs = tric.getSamples()
                # Similarity along the sample (feature) axis.
                for fo in tric_fs:
                    # Patient slice truncated to the tricluster's time span.
                    p_slice = bic_p.getSlice(c=fo)[:len(tric_tps)]
                    #print("fop", p_slice)
                    t_slice = tric.getSlice(g=tric_p, c=fo)
                    #print("fot", t_slice)
                    # Canberra is a distance, so 1 - d acts as a similarity.
                    corr_list.append(1 - distance.canberra(p_slice, t_slice))
                    #print(corr_list)
                # Time-axis comparison: currently DEAD — append is commented
                # out below, so these getSlice calls have no effect on output.
                for to in tric_tps:
                    p_slice = bic_p.getSlice(t=to)[:len(tric_fs)]
                    #print("top", p_slice)
                    t_slice = tric.getSlice(g=tric_p, t=to)
                    #print("tot", t_slice)
                    # corr_list.append(1 - distance.canberra(p_slice, t_slice))
                    #print(corr_list)
            #rint(corr_list)
            # One cell per tricluster: mean similarity over all comparisons.
            line.append(stat.mean(corr_list))
            #print(len(line),line)
        final_matrix.append(line)
    return final_matrix
def calculateL2(self, feat1, feat2, c_type='euclidean'):
    """
    Distance between two equally-shaped feature vectors.

    Args:
        feat1, feat2: feature arrays of identical shape.
        c_type: one of "cosine", "euclidean", "correlation", "braycurtis",
            "canberra", "chebyshev".

    Returns:
        The scalar distance under the chosen metric.

    Raises:
        ValueError: if c_type is not a supported metric name (previously
            this fell through and raised UnboundLocalError on `s_d`).
    """
    assert np.shape(feat1) == np.shape(feat2)
    # len_ is not used by any active branch (a weighted euclidean with
    # w=1./len_ was tried before), but the unpacking doubles as a rank
    # check: 1-D when config.insight, 2-D otherwise — so it is kept.
    if config.insight:
        [ len_, ] = np.shape(feat1)
    else:
        _, len_ = np.shape(feat1)
    if c_type == "cosine":
        s_d = distance.cosine(feat1, feat2)
    elif c_type == "euclidean":
        s_d = distance.euclidean(feat1, feat2, w=1)
    elif c_type == "correlation":
        s_d = distance.correlation(feat1, feat2)
    elif c_type == "braycurtis":
        s_d = distance.braycurtis(feat1, feat2)
    elif c_type == 'canberra':
        s_d = distance.canberra(feat1, feat2)
    elif c_type == "chebyshev":
        s_d = distance.chebyshev(feat1, feat2)
    else:
        raise ValueError("unsupported c_type: {!r}".format(c_type))
    return s_d
def distance_features(data,genismModel):
    """
    Append word2vec-based distance features to `data` in place.

    Returns:
        (data, feature_names): the mutated frame and the list of the 11
        feature column names that were added.
    """
    zero_vec = np.zeros(300)

    def _clean(vectors):
        # sent2vec yields a size-1 value when it cannot embed the sentence;
        # substitute a 300-d zero vector so the distance calls still work.
        return [zero_vec if v.size == 1 else v for v in vectors]

    q1_vecs = _clean([sent2vec(q, genismModel) for q in data.question1])
    q2_vecs = _clean([sent2vec(q, genismModel) for q in data.question2])

    paired = list(zip(q1_vecs, q2_vecs))
    data['cosine_distance'] = [cosine(x, y) for x, y in paired]
    data['cityblock_distance'] = [cityblock(x, y) for x, y in paired]
    data['jaccard_distance'] = [jaccard(x, y) for x, y in paired]
    data['canberra_distance'] = [canberra(x, y) for x, y in paired]
    data['euclidean_distance'] = [euclidean(x, y) for x, y in paired]
    data['minkowski_distance'] = [minkowski(x, y, 3) for x, y in paired]
    data['braycurtis_distance'] = [braycurtis(x, y) for x, y in paired]
    data['skew_q1vec'] = [skew(v) for v in q1_vecs]
    data['skew_q2vec'] = [skew(v) for v in q2_vecs]
    data['kur_q1vec'] = [kurtosis(v) for v in q1_vecs]
    data['kur_q2vec'] = [kurtosis(v) for v in q2_vecs]

    feature_names = ['cosine_distance', 'cityblock_distance', 'jaccard_distance',
                     'canberra_distance', 'euclidean_distance', 'minkowski_distance',
                     'braycurtis_distance', 'skew_q1vec', 'skew_q2vec',
                     'kur_q1vec', 'kur_q2vec']
    return data, feature_names
def feature3(data):
    """
    Add 11 sentence-vector distance features to `data` in place and return it.

    Embeds question1/question2 with sent2vec into 300-d vectors, then adds
    pairwise distances plus per-question skew/kurtosis columns.
    """
    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question1.values)):
        question1_vectors[i, :] = sent2vec(q)
    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question2.values)):
        question2_vectors[i, :] = sent2vec(q)

    # Sanitise NaNs once; the original recomputed np.nan_to_num for every
    # single feature column (22 full-matrix passes).
    q1 = np.nan_to_num(question1_vectors)
    q2 = np.nan_to_num(question2_vectors)
    pairs = list(zip(q1, q2))

    data['cosine_distance'] = [cosine(x, y) for x, y in pairs]
    data['cityblock_distance'] = [cityblock(x, y) for x, y in pairs]
    data['jaccard_distance'] = [jaccard(x, y) for x, y in pairs]
    data['canberra_distance'] = [canberra(x, y) for x, y in pairs]
    data['euclidean_distance'] = [euclidean(x, y) for x, y in pairs]
    data['minkowski_distance'] = [minkowski(x, y, 3) for x, y in pairs]
    data['braycurtis_distance'] = [braycurtis(x, y) for x, y in pairs]
    data['skew_q1vec'] = [skew(x) for x in q1]
    data['skew_q2vec'] = [skew(x) for x in q2]
    data['kur_q1vec'] = [kurtosis(x) for x in q1]
    data['kur_q2vec'] = [kurtosis(x) for x in q2]
    return data
def computeDistance(X, Y, method):
    """
    Dispatch a distance computation by *substring* match on `method`.

    NOTE(review): matching is by `in`, so order matters — 'chi2'/'chi3' must
    be tested before 'chi', and a method string containing e.g. 'cosine'
    anywhere will take that branch.  Unmatched methods fall through to
    cv2.compareHist, which presumably expects a cv2 comparison flag rather
    than a string — confirm against the callers.
    """
    if 'cosine' in method:
        dist = spdistance.cosine(X, Y)
    elif 'dot' in method:
        # Dot-product similarity converted to a distance.
        dist = 1.0 - X.dot(Y)
    elif 'chi2' in method:
        dist = chiSquare2(X, Y)
    elif 'chi3' in method:
        dist = chiSquare3(X, Y)
    elif 'chi' in method:
        # Must come after the chi2/chi3 checks ('chi' is their substring).
        dist = chiSquare(X, Y)
    elif 'euclidean' in method:
        # cv2.norm(X, Y) is the L2 norm of the difference.
        dist = cv2.norm(X, Y)
    elif 'canberra' in method:
        dist = spdistance.canberra(X, Y)
    elif 'correl' in method:
        dist = spdistance.correlation(X, Y)
    else:
        # does that work?
        dist = cv2.compareHist(X, Y, method)
        # Correlation/intersection are similarities; flip them to distances.
        # hasattr guards cover both the old cv2.cv API and the new flags.
        if hasattr(cv2, 'cv') and 'cv2.cv.CV_COMP_CORREL' in method:
            dist = 1 - dist
        elif hasattr(cv2, 'HISTCMP_CORREL') and 'cv2.HISTCMP_CORREL' in method:
            dist = 1 - dist
        elif hasattr(cv2, 'cv') and 'cv2.cv.CV_COMP_INTERSECT' in method:
            dist = 1 - dist
        elif hasattr(cv2, 'HISTCMP_INTERSECT') and 'cv2.HISTCMP_INTERSECT' in method:
            dist = 1 - dist
    return dist
def extend_with_features(data):
    """
    Append fuzzy-match, word-mover and word2vec distance features to `data`
    in place and return it.

    Loads the Google News word2vec model twice: once raw (for WMD and
    sentence vectors) and once L2-normalised (for norm_wmd), because
    init_sims(replace=True) normalises vectors in place.
    """
    # Removed: `stop_words = stopwords.words('english')` — it was never
    # used and forced an nltk corpus load on every call.
    data['fuzz_qratio'] = data.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_WRatio'] = data.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)

    model = gensim.models.KeyedVectors.load_word2vec_format(
        google_news_model_path, binary=True)
    data['wmd'] = data.apply(
        lambda x: wmd(model, x['question1'], x['question2']), axis=1)

    norm_model = gensim.models.KeyedVectors.load_word2vec_format(
        google_news_model_path, binary=True)
    # NOTE(review): init_sims is deprecated in gensim 4.x — confirm version.
    norm_model.init_sims(replace=True)
    data['norm_wmd'] = data.apply(
        lambda x: norm_wmd(norm_model, x['question1'], x['question2']), axis=1)

    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in enumerate(data.question1.values):
        question1_vectors[i, :] = sent2vec(model, q)
    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in enumerate(data.question2.values):
        question2_vectors[i, :] = sent2vec(model, q)
    question1_vectors = np.nan_to_num(question1_vectors)
    question2_vectors = np.nan_to_num(question2_vectors)

    pairs = list(zip(question1_vectors, question2_vectors))
    data['cosine_distance'] = [cosine(x, y) for x, y in pairs]
    data['cityblock_distance'] = [cityblock(x, y) for x, y in pairs]
    data['jaccard_distance'] = [jaccard(x, y) for x, y in pairs]
    data['canberra_distance'] = [canberra(x, y) for x, y in pairs]
    data['euclidean_distance'] = [euclidean(x, y) for x, y in pairs]
    data['minkowski_distance'] = [minkowski(x, y, 3) for x, y in pairs]
    data['braycurtis_distance'] = [braycurtis(x, y) for x, y in pairs]
    data['skew_q1vec'] = [skew(x) for x in question1_vectors]
    data['skew_q2vec'] = [skew(x) for x in question2_vectors]
    data['kur_q1vec'] = [kurtosis(x) for x in question1_vectors]
    data['kur_q2vec'] = [kurtosis(x) for x in question2_vectors]
    return data
def vectors_features(in_data: pd.DataFrame, sent2vec: Callable[[str], np.array]) -> pd.DataFrame:
    """
    Add sentence-vector distance features for the question pair columns.

    Embeds each question with `sent2vec`, then appends seven pairwise
    distance columns, per-question skew/kurtosis, and their absolute
    differences.  Mutates and returns `in_data`.
    """
    assert "question1" in in_data.columns
    assert "question2" in in_data.columns

    vecs_a = np.array([sent2vec(text) for text in in_data['question1']])
    vecs_b = np.array([sent2vec(text) for text in in_data['question2']])

    # Column name -> pairwise metric; insertion order matches the original.
    metrics = {
        'cos': cosine,
        'jaccard': jaccard,
        'euclidean': euclidean,
        'minkowski': minkowski,   # default p=2
        'cityblock': cityblock,
        'canberra': canberra,
        'braycurtis': braycurtis,
    }
    for column, metric in metrics.items():
        in_data[column] = np.array(
            [metric(u, v) for u, v in zip(vecs_a, vecs_b)])

    in_data['skew_q1'] = np.array([skew(v) for v in vecs_a])
    in_data['skew_q2'] = np.array([skew(v) for v in vecs_b])
    in_data['kur_q1'] = np.array([kurtosis(v) for v in vecs_a])
    in_data['kur_q2'] = np.array([kurtosis(v) for v in vecs_b])
    in_data['skew_diff'] = np.abs(in_data['skew_q1'] - in_data['skew_q2'])
    in_data['kur_diff'] = np.abs(in_data['kur_q1'] - in_data['kur_q2'])
    return in_data
def similarity_function(x, y):
    """
    Similarity function for comparing user features.

    This actually really should be implemented in taar.similarity_recommender
    and then imported here for consistency.
    """
    def _field(row, key, fallback):
        # Safely read a field from the Row, substituting `fallback` for None.
        value = row[key]
        return fallback if value is None else value

    # Categorical features default to "" and continuous ones to 0 when
    # missing, for both samples.
    cat_x = [_field(x, key, "") for key in CATEGORICAL_FEATURES]
    cat_y = [_field(y, key, "") for key in CATEGORICAL_FEATURES]
    cont_x = [float(_field(x, key, 0)) for key in CONTINUOUS_FEATURES]
    cont_y = [float(_field(y, key, 0)) for key in CONTINUOUS_FEATURES]

    # Larger distances indicate a poorer match.
    j_d = distance.hamming(cat_x, cat_y)
    j_c = distance.canberra(cont_x, cont_y)

    # Combine the two distances multiplicatively; the small constant keeps a
    # zero continuous distance from collapsing the product.  .item() converts
    # the numpy scalar to a plain Python number (works around SPARK-20803 on
    # Spark 2.2.0 when performing KDE).
    return abs((j_c + 0.001) * j_d).item()
def get_w2v_simi(query, title):
    """
    Colon-joined string of 11 word2vec similarity features for (query, title):
    seven pairwise distances followed by skew/kurtosis of each vector.
    """
    q_vec = np.nan_to_num(sent2vec(query))
    t_vec = np.nan_to_num(sent2vec(title))
    features = [
        cosine(q_vec, t_vec),
        cityblock(q_vec, t_vec),
        jaccard(q_vec, t_vec),
        canberra(q_vec, t_vec),
        euclidean(q_vec, t_vec),
        minkowski(q_vec, t_vec),
        braycurtis(q_vec, t_vec),
        skew(q_vec),
        skew(t_vec),
        kurtosis(q_vec),
        kurtosis(t_vec),
    ]
    return ':'.join('{}'.format(value) for value in features)
def kmeansClassify(A, means, distType = "euclidean"):
    """
    Assign each row of A to its nearest row of `means`.

    Args:
        A: (n, d) data matrix.
        means: (k, d) matrix of cluster means.
        distType: metric name — "euclidean", "cosine", "canberra",
            "manhattan", "correlation" or "hamming".

    Returns:
        (codes, errors): (n, 1) matrices of the winning mean's row index
        and the corresponding distance.
    """
    # Resolve the metric once instead of re-checking the name per pair.
    metrics = {
        "euclidean": dist.euclidean,
        "cosine": dist.cosine,
        "canberra": dist.canberra,
        "manhattan": dist.cityblock,
        "correlation": dist.correlation,
        "hamming": dist.hamming,
    }
    metric = metrics[distType]
    codesErrors = []
    for i in range(A.shape[0]):
        # [best index, best distance]; float('inf') replaces the
        # Python-2-only sys.maxint so any real distance beats it.
        d = [0, float('inf')]
        for j in range(means.shape[0]):
            newd = metric(A[i, :], means[j, :])
            if newd < d[1]:
                d = [j, newd]
        codesErrors.append(d)
    # Returns the codes and errors as column matrices.
    return (np.matrix(codesErrors)[:, 0], np.matrix(codesErrors)[:, 1])
def calculate_distance(X, Y, metric='euclidean'):
    """
    Distance between X and Y under the metric selected by the module-level
    METRIC_* constants.

    NOTE(review): several branches look broken or inconsistent —
      * wminkowski is called without its required weight vector (TypeError),
        and scipy removed `wminkowski` in 1.8;
      * mahalanobis is called without the required inverse covariance VI
        (TypeError);
      * the cosine branch returns cosine *similarity*, not a distance,
        unlike every other branch;
      * an unknown metric silently returns None (no final else).
    Confirm against the callers before changing any of these.
    """
    if metric == METRIC_EUCLIDEAN:
        return distance.euclidean(X, Y)
    elif metric == METRIC_JACCARD:
        return distance.jaccard(X, Y)
    elif metric == METRIC_CANBERRA:
        return distance.canberra(X, Y)
    elif metric == METRIC_CHEBYSHEV:
        return distance.chebyshev(X, Y)
    elif metric == METRIC_MINKOWSKI:
        # Default p=2, i.e. identical to the euclidean branch.
        return distance.minkowski(X, Y)
    elif metric == METRIC_WMINKOWSKI:
        # Missing the `w` argument — this call will raise TypeError.
        return distance.wminkowski(X, Y)
    elif metric == METRIC_BRAYCURTIS:
        return distance.braycurtis(X, Y)
    elif metric == METRIC_HAMMING:
        return distance.hamming(X, Y)
    elif metric == METRIC_MAHALANOBIS:
        # Missing the `VI` argument — this call will raise TypeError.
        return distance.mahalanobis(X, Y)
    elif metric == METRIC_MANHATTAN:
        # Hand-rolled L1; equivalent to distance.cityblock(X, Y).
        return sum(abs(a - b) for a, b in zip(X, Y))
    elif metric == METRIC_COSINE:
        # Returns cosine SIMILARITY (1 for identical directions), not the
        # cosine distance returned by scipy.
        dot_product = np.dot(X, Y)
        norm_a = np.linalg.norm(X)
        norm_b = np.linalg.norm(Y)
        return dot_product / (norm_a * norm_b)
def calculate_featureset4(dataframe, q1_vectors, q2_vectors):
    """
    Add 11 vector-distance feature columns to `dataframe` in place.

    Args:
        dataframe: target frame (one row per vector pair).
        q1_vectors, q2_vectors: aligned (n, d) arrays of sentence vectors.

    Returns:
        The mutated dataframe.
    """
    # Sanitise NaNs once; the original re-ran np.nan_to_num over the full
    # matrices for every single column (22 passes).
    q1 = np.nan_to_num(q1_vectors)
    q2 = np.nan_to_num(q2_vectors)
    pairs = list(zip(q1, q2))

    dataframe['cosine_dist'] = [cosine(x, y) for x, y in pairs]
    dataframe['cityblock_dist'] = [cityblock(x, y) for x, y in pairs]
    dataframe['jaccard_dist'] = [jaccard(x, y) for x, y in pairs]
    dataframe['canberra_dist'] = [canberra(x, y) for x, y in pairs]
    dataframe['euclidean_dist'] = [euclidean(x, y) for x, y in pairs]
    dataframe['minkowski_dist'] = [minkowski(x, y, 3) for x, y in pairs]
    dataframe['braycurtis_dist'] = [braycurtis(x, y) for x, y in pairs]
    dataframe['skew_q1'] = [skew(x) for x in q1]
    dataframe['skew_q2'] = [skew(x) for x in q2]
    dataframe['kurtosis_q1'] = [kurtosis(x) for x in q1]
    dataframe['kurtosis_q2'] = [kurtosis(x) for x in q2]
    return dataframe
def features_similarity(cls, df):
    """
    Populate cls.dict_features with 11 vector-similarity features for the
    question pairs in `df`.

    Returns:
        (question1_vectors, question2_vectors): the raw (pre-sanitising)
        sentence-vector matrices, as before.
    """
    cls.load_model(normed=True)
    question1_vectors, question2_vectors = cls.get_questions_vector(df)
    cls.resetmodel()

    # Sanitise NaNs once; the original recomputed np.nan_to_num over both
    # full matrices for every feature column (22 passes).
    q1 = np.nan_to_num(question1_vectors)
    q2 = np.nan_to_num(question2_vectors)
    pairs = list(zip(q1, q2))

    cls.dict_features['cosine_distance'] = [cosine(x, y) for x, y in pairs]
    print("1/11 Cosine Distance finished.")
    cls.dict_features['cityblock_distance'] = [cityblock(x, y) for x, y in pairs]
    print("2/11 Cityblock Distance finished.")
    cls.dict_features['jaccard_distance'] = [jaccard(x, y) for x, y in pairs]
    print("3/11 Jaccard Distance finished.")
    cls.dict_features['canberra_distance'] = [canberra(x, y) for x, y in pairs]
    print("4/11 Canberra Distance finished.")
    cls.dict_features['euclidean_distance'] = [euclidean(x, y) for x, y in pairs]
    print("5/11 Euclidean Distance finished.")
    cls.dict_features['minkowski_distance'] = [minkowski(x, y, 3) for x, y in pairs]
    print("6/11 Minkowski Distance finished.")
    cls.dict_features['braycurtis_distance'] = [braycurtis(x, y) for x, y in pairs]
    print("7/11 Braycurtis Distance finished.")
    cls.dict_features['skew_q1vec'] = [skew(x) for x in q1]
    print("8/11 Skew Q1 Vec finished.")
    cls.dict_features['skew_q2vec'] = [skew(x) for x in q2]
    print("9/11 Skew Q2 Vec finished.")
    cls.dict_features['kur_q1vec'] = [kurtosis(x) for x in q1]
    print("10/11 Kurtosis Q1 Vec finished.")
    cls.dict_features['kur_q2vec'] = [kurtosis(x) for x in q2]
    print("11/11 Kurtosis Q2 Vec finished.")
    return question1_vectors, question2_vectors
def canberra(self):
    '''
    Canberra distance between the two conn matrices.
    The matrices are vectorized before the comparison.
    (Previous docstring incorrectly said "Euclidean" — the code computes
    the Canberra distance.)
    '''
    vec1 = self._vectorize(self.conn1)
    vec2 = self._vectorize(self.conn2)
    return distance.canberra(vec1, vec2)
def main():
    """Demo: print the Canberra distance between two sample vectors."""
    xs = [1, 2, 3, 4, 5]
    ys = [2, 3, 4, 6, 8]
    print(distance.canberra(xs, ys))
def canb(m, n):
    """
    Canberra distance, rounded to 2 decimals, between paired sparse rows.

    Args:
        m, n: aligned sequences of scipy sparse row vectors.

    Returns:
        List of rounded distances, one per pair.
    """
    # .ravel() flattens the (1, d) dense rows: modern scipy distance
    # functions require strictly 1-D input.
    k = [
        round(distance.canberra(a.toarray().ravel(), b.toarray().ravel()), 2)
        for a, b in zip(m, n)
    ]
    print('canb')
    return k
def point_distance(point_a, point_b, type="graph", map="de_dust2"):
    """
    Returns the distance between two points using a given method on a given map (if needed)

    Args:
        point_a: A list of floats or ints containing the position of point A
        point_b: A list of floats or ints containing the position of point B
        type: A string that is one of 'euclidean', 'manhattan', 'canberra',
            'cosine' or 'graph'. Using 'graph' will use A* to find the
            shortest path and counts the discrete areas it travels.
        map: A string indicating the map

    Raises:
        ValueError: on an unknown map or distance type (previously an
            unknown type silently returned None).
    """
    if map not in [
        "de_dust2",
        "de_cbble",
        "de_inferno",
        "de_mirage",
        "de_nuke",
        "de_overpass",
        "de_train",
        "de_vertigo",
    ]:
        raise ValueError(
            f'Invalid map name: got {map}, expected one of: "de_dust2", "de_cbble", "de_inferno", "de_mirage", "de_nuke", "de_overpass", "de_train", "de_vertigo"'
        )
    if type == "graph":
        # Shell out to the Go A* pathfinder next to this module.
        path = os.path.join(os.path.dirname(__file__), "")
        proc = subprocess.Popen(
            [
                "go",
                "run",
                "path_distance.go",
                "-map",
                map,
                "-start_x",
                str(point_a[0]),
                "-start_y",
                str(point_a[1]),
                "-start_z",
                str(point_a[2]),
                "-end_x",
                str(point_b[0]),
                "-end_y",
                str(point_b[1]),
                "-end_z",
                str(point_b[2]),
            ],
            stdout=subprocess.PIPE,
            cwd=path,
        )
        return int(proc.stdout.read())
    elif type == "euclidean":
        return distance.euclidean(point_a, point_b)
    elif type == "manhattan":
        return distance.cityblock(point_a, point_b)
    elif type == "canberra":
        return distance.canberra(point_a, point_b)
    elif type == "cosine":
        return distance.cosine(point_a, point_b)
    else:
        raise ValueError(
            f'Invalid distance type: got {type}, expected one of: "graph", "euclidean", "manhattan", "canberra", "cosine"'
        )
def feats_tfidf(row):
    """
    LSA-space distance features for one question pair.

    Projects both questions into the module-level `lsi` topic space and
    returns a list of 11 features: seven pairwise distances followed by
    skew/kurtosis of each LSA vector.
    """
    que1 = str(row['question1'])
    que2 = str(row['question2'])
    # Project each question into LSA space via the shared dictionary/model.
    que1_vec = [value for (index, value) in lsi[dictionary.doc2bow(que1.lower().split())]]
    que2_vec = [value for (index, value) in lsi[dictionary.doc2bow(que2.lower().split())]]
    # lsi can drop near-zero topics, so the vectors may differ in length;
    # truncate both to the shorter one (same effect as the original if/else).
    n_dims = min(len(que1_vec), len(que2_vec))
    que1_vec = que1_vec[:n_dims]
    que2_vec = que2_vec[:n_dims]
    # cosine can fail (e.g. zero vectors); fall back to maximal distance 1.
    try:
        lsa_cosine = cosine(que1_vec, que2_vec)
    except Exception:  # narrowed from a bare `except:`
        lsa_cosine = 1
    lsa_cityblock = cityblock(que1_vec, que2_vec)
    lsa_jaccard = jaccard(que1_vec, que2_vec)
    lsa_canberra = canberra(que1_vec, que2_vec)
    try:
        lsa_euclidean = euclidean(que1_vec, que2_vec)
    except Exception:  # narrowed from a bare `except:`
        lsa_euclidean = np.nan
    lsa_minkowski = minkowski(que1_vec, que2_vec, 3)
    lsa_braycurtis = braycurtis(que1_vec, que2_vec)
    lsa_q1_skew = skew(que1_vec)
    lsa_q1_kurtosis = kurtosis(que1_vec)
    lsa_q2_skew = skew(que2_vec)
    lsa_q2_kurtosis = kurtosis(que2_vec)
    return [lsa_cosine, lsa_cityblock, lsa_jaccard, lsa_canberra, lsa_euclidean,
            lsa_minkowski, lsa_braycurtis, lsa_q1_skew, lsa_q1_kurtosis,
            lsa_q2_skew, lsa_q2_kurtosis]
def canberraDist(h, b, bigram_vectorizer):
    """
    Canberra distance between the bigram vectors of `h` and `b`, scaled
    down by 1000.  The vectorizer is (re)fitted on just these two texts.
    """
    vectors = bigram_vectorizer.fit_transform([h, b]).toarray()
    return canberra(vectors[0, :], vectors[1, :]) / 1000
def canberraDist(h,b, bigram_vectorizer):
    """
    Scaled Canberra distance between two texts in bigram-count space.

    Fits the vectorizer on the two texts, densifies the counts, and
    returns canberra(row0, row1) / 1000.
    """
    documents = []
    documents.append(h)
    documents.append(b)
    matrix = bigram_vectorizer.fit_transform(documents).toarray()
    first_row = matrix[0, :]
    second_row = matrix[1, :]
    return canberra(first_row, second_row) / 1000
def calc_nearest_to(nearest):
    """
    Return the 20 pictures closest (by Canberra distance) to `nearest`.

    The target picture is removed from the candidate pool, every remaining
    picture is scored against it, and the 20 best-scoring (smallest
    distance) entries are returned as an ordered dict.
    """
    pics = read_json()
    target = pics.pop(str(nearest))
    scores = {key: canberra(target, features) for key, features in pics.items()}
    ranked = sorted(scores.items(), key=lambda item: item[1])
    return dict(ranked[:20])
def pair_coherence(self, word_i, word_j, metric=None):
    """
    Coherence of a word pair under the chosen metric.

    For "correlation", "chebyshev", "euclidean" or "canberra" returns
    1 - distance between the words' embedding vectors; any other value
    of `metric` (including None) falls back to the model's own similarity.
    """
    dispatch = {
        "correlation": distance.correlation,
        "chebyshev": distance.chebyshev,
        "euclidean": distance.euclidean,
        "canberra": distance.canberra,
    }
    metric_fn = dispatch.get(metric)
    if metric_fn is not None:
        return 1 - metric_fn(self.model[word_i], self.model[word_j])
    return self.model.similarity(word_i,word_j)
def similar(q1, q2):
    """
    Distance bundle for a question pair.

    Returns:
        (euclidean, cosine distance, angle in degrees, canberra, correlation)
        between the two sentence vectors.
    """
    v1, v2 = sent2vec(q1, q2)
    cos = cosine(v1, v2)
    # acos is only defined on [-1, 1]; clamp to absorb floating-point drift
    # (same effect as the original if/elif ladder).
    deg = math.degrees(math.acos(max(-1, min(1, cos))))
    return euclidean(v1, v2), cos, deg, canberra(v1, v2), correlation(v1, v2)
def get_distance_features(data, emb):
    """
    Append seven pairwise-distance columns to `data` in place.

    Args:
        data: target DataFrame.
        emb: iterable of (vec1, vec2) pairs, one per row.

    Returns:
        The mutated DataFrame.
    """
    # Materialize once: the original iterated `emb` seven times, which
    # silently produced empty/NaN columns when `emb` was a generator.
    pairs = list(emb)
    data['cosine_distance'] = pd.Series([cosine(x, y) for x, y in pairs])
    data['cityblock_distance'] = pd.Series([cityblock(x, y) for x, y in pairs])
    data['jaccard_distance'] = pd.Series([jaccard(x, y) for x, y in pairs])
    data['canberra_distance'] = pd.Series([canberra(x, y) for x, y in pairs])
    data['euclidean_distance'] = pd.Series([euclidean(x, y) for x, y in pairs])
    data['minkowski_distance'] = pd.Series(
        [minkowski(x, y, 3) for x, y in pairs])
    data['braycurtis_distance'] = pd.Series([braycurtis(x, y) for x, y in pairs])
    return data
def canberra_dist(user_predict, adoptable_dogs, images):
    '''
    Calculating Canberra distance between two 1D arrays and return
    similiarty score

    Args:
        user_predict: array of the user's predicted features.
        adoptable_dogs: sequence of feature arrays, one per dog.
        images: image filenames aligned with adoptable_dogs.

    Returns:
        DataFrame with columns 'imgFile' and 'SimScore'.
    '''
    # Flatten the user vector once (loop-invariant) and iterate the dogs
    # directly instead of indexing range(len(...)).
    flat_user = user_predict.flatten()
    sim_score = [
        distance.canberra(flat_user, dog.flatten()) for dog in adoptable_dogs
    ]
    print('Maximum SimScore: ' + str(max(sim_score)))
    return pd.DataFrame({'imgFile': images, 'SimScore': sim_score})
def feature_construct(city, model_name, friends, walk_len=100, walk_times=20, num_features=128):
    '''Construct the pairwise user feature file from a trained embedding.

    Args:
        city: city name (used to build dataset paths)
        model_name: embedding model tag, e.g. 20_locid
        friends: friends list (asymetric) [u1, u2]
        walk_len: walk length
        walk_times: walk times
        num_features: dimension for vector

    Side effects:
        Deletes any existing `.feature` output file for this configuration,
        then appends one CSV row per user pair:
        [u1, u2, label, 8 pairwise distances between embedding vectors].
        NOTE(review): one to_csv append per pair is very slow for large
        pair sets — consider batching.
    '''
    # Start from a clean output file, since rows are appended below.
    if os.path.exists('dataset/'+city+'/feature/'+city+'_'+model_name+'_'+\
        str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.feature'):
        os.remove('dataset/'+city+'/feature/'+city+'_'+model_name+'_'+\
            str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.feature')
    # node2vec-style .emb file: skiprows=1 drops the header row; columns
    # are space-separated with the node id first.
    emb = pd.read_csv('dataset/'+city+'/emb/'+city+'_'+model_name+'_'+\
        str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.emb',\
        header=None, skiprows=1, sep=' ')
    emb = emb.rename(columns={0: 'uid'})  # column 0 is the user id
    emb = emb.loc[emb.uid > 0]  # only take users, no loc_type, not necessary
    # Candidate (u1, u2, label) pairs from the project-level helper.
    pair = pair_construct(emb.uid.unique(), friends)
    for i in range(len(pair)):
        u1 = pair.loc[i, 'u1']
        u2 = pair.loc[i, 'u2']
        label = pair.loc[i, 'label']
        # Embedding vectors are all columns except the uid column.
        u1_vector = emb.loc[emb.uid == u1, range(1, emb.shape[1])]
        u2_vector = emb.loc[emb.uid == u2, range(1, emb.shape[1])]
        # Eight pairwise distances between the two embedding vectors.
        i_feature = pd.DataFrame([[
            u1, u2, label,
            cosine(u1_vector, u2_vector),
            euclidean(u1_vector, u2_vector),
            correlation(u1_vector, u2_vector),
            chebyshev(u1_vector, u2_vector),
            braycurtis(u1_vector, u2_vector),
            canberra(u1_vector, u2_vector),
            cityblock(u1_vector, u2_vector),
            sqeuclidean(u1_vector, u2_vector)
        ]])
        # Append this pair's row to the output file.
        i_feature.to_csv('dataset/'+city+'/feature/'+city+'_'+model_name+'_'+\
            str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.feature',\
            index = False, header = None, mode = 'a')
def plt_compare(one, two):
    """
    Plot two stored histograms on one figure and print their Canberra
    distance (smaller means more similar).
    """
    histograms = read_json()
    first = histograms[str(one)]
    second = histograms[str(two)]
    plt.plot(first, color='red')
    similarity = canberra(first, second)
    print(f'length : {len(second)}')
    print(f'similarity : {similarity}')
    plt.plot(second)
    plt.show()
def calc_ROSA(fea0, fea1):
    '''
    Calculating ROSA values of fea0 and fea1, where fea0 is the features
    of observation, and fea1 is the features of forecast.

    Returns a dict with keys: r_rate, x_offset, y_offset, sigma_x_rate,
    sigma_y_rate, ecc, Hu_dist, angle.
    '''
    value = {}
    #R — rainfall ratio (via the project-level helper `dr`).
    value['r_rate'] = dr(fea0['average_rainfall'], fea1['average_rainfall'])
    #O — centroid offset, forecast minus observation.
    value['x_offset'] = fea1['x_c'] - fea0['x_c']
    value['y_offset'] = fea1['y_c'] - fea0['y_c']
    #S — shape: spread ratios, eccentricity difference, Hu-moment distance.
    value['sigma_x_rate'] = dr(fea0['sigma_x'], fea1['sigma_x'])
    value['sigma_y_rate'] = dr(fea0['sigma_y'], fea1['sigma_y'])
    value['ecc'] = fea1['eccentricity'] - fea0['eccentricity']
    value['Hu_dist'] = distance.canberra(fea1['Hu'], fea0['Hu']) # Canberra distance
    # Other distances can be tried and tested.
    #A — angle between major axes; undefined (set to 0) for near-circular
    # shapes whose eccentricity falls below MIN_E.
    if (fea0['eccentricity'] < MIN_E) or (fea1['eccentricity'] < MIN_E):
        value['angle'] = 0.
    else:
        v0 = fea0['major_axis']
        v1 = fea1['major_axis']
        # Signed angle via arccos of the normalised dot product; the sign
        # comes from the cross product (rotation direction).
        theta = numpy.dot(v0, v1)/(numpy.linalg.norm(v0)*numpy.linalg.norm(v1))
        theta = numpy.arccos(theta)/numpy.pi*180
        sgn = numpy.sign(numpy.cross(v0, v1))
        value['angle'] = sgn*theta
        # The angle between two straight lines should be
        # bounded between -90 and +90.
        if value['angle'] > 90:
            value['angle'] = value['angle'] - 180
        elif value['angle'] < -90:
            value['angle'] = value['angle'] + 180
    return value
# Score the user's input against every stored question and answer.
# The input is prepended so row/column 0 of each vectorized matrix is the
# user's text, and each distance list below compares element 0 to all rows.
all_questions=[userinput]+all_questions
all_answers=[d['answer'] for d in data]
all_answers=[userinput]+all_answers
# TF vectorize questions and answers with their respective vectorizers.
QuestionTVectorArray=QTvectorizer.fit_transform(all_questions)
AnswerTVectorArray=ATvectorizer.fit_transform(all_answers)
#print "question cosine similairity-->",cosine_similarity(QuestionTVectorArray[0:1],QuestionTVectorArray)
#print "answer cosine similarity-->",cosine_similarity(AnswerTVectorArray[0:1],AnswerTVectorArray)
# Cosine similarity of the user input (row 0) versus every row.
Qcosines=cosine_similarity(QuestionTVectorArray[0:1],QuestionTVectorArray)
Acosines=cosine_similarity(AnswerTVectorArray[0:1],AnswerTVectorArray)
# Pairwise distances of row 0 against each row.  NOTE(review): the sparse
# rows are densified to (1, n) arrays; newer scipy versions require 1-D
# input for these functions — confirm the pinned scipy version.
Qbray=[dist.braycurtis(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Abray=[dist.braycurtis(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]
Qcanberra=[dist.canberra(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Acanberra=[dist.canberra(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]
Qhamming=[dist.hamming(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Ahamming=[dist.hamming(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]
Qcorrelation=[dist.correlation(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Acorrelation=[dist.correlation(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]
Qcityblock=[dist.cityblock(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Acityblock=[dist.cityblock(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]
# dice/yule are boolean-set distances — presumably the vectors are binary
# term indicators here; verify, as they are undefined for raw counts.
Qdice=[dist.dice(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Adice=[dist.dice(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]
Qyule=[dist.yule(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
def pairwise_compare(signature_vectors):
    # Print the Canberra distance between every unordered pair of graph
    # signature vectors (j starts at i+1, so each pair appears once and no
    # graph is compared with itself).
    # NOTE: Python 2 syntax (print statements).
    print "Pairwise Comparison of graphs ... "
    for i in range(0, len(signature_vectors)):
        for j in range(i+1, len(signature_vectors)):
            print "\t Distance between graph", i, "and graph ", j, dis.canberra(signature_vectors[i],signature_vectors[j])
# NOTE(review): fragment — the first statement belongs to an enclosing loop
# (over data.question1.values) whose `def`/`for` lines are not visible here;
# `i`, `q`, `data`, `question1_vectors` and `sent2vec` come from that scope.
question1_vectors[i, :] = sent2vec(q)
question2_vectors = np.zeros((data.shape[0], 300))
for i, q in tqdm(enumerate(data.question2.values)):
    question2_vectors[i, :] = sent2vec(q)
# Pairwise distance features between the two sentence-vector matrices;
# np.nan_to_num guards against NaN rows from empty/out-of-vocabulary text
# (recomputed per column — could be hoisted).
data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
# Per-question distribution-shape features.
data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]
def wvCanb(a):
    """Canberra distance for each (vec1, vec2) pair in `a`."""
    results = []
    for pair in a:
        results.append(distance.canberra(pair[0], pair[1]))
    return results
def canberra(pair):
    """Canberra distance between the two vectors in `pair`.

    Python-3-compatible replacement for the Python-2-only tuple-parameter
    form ``def canberra((x, y))`` (a SyntaxError in Python 3); callers still
    pass a single (x, y) tuple.
    """
    x, y = pair
    return distance.canberra(x, y)