def __calc_distances__(self, v1s, v2s, is_sparse=True):
    """Compute pairwise distance and moment-difference features.

    Parameters
    ----------
    v1s, v2s : equal-length sequences of vectors. When ``is_sparse`` is
        True each element is a scipy sparse matrix (densified via
        ``.toarray()``); otherwise each element is dense array-like.
    is_sparse : bool, default True

    Returns
    -------
    np.ndarray of shape (len(v1s), 8): cosine, cityblock, canberra,
    euclidean, minkowski(p=3) and braycurtis distances, followed by the
    absolute skew and kurtosis differences of the paired vectors.

    Defect fixed: the sparse and dense branches duplicated every
    computation; the input is now densified once and both cases share a
    single code path (sparse vectors are also raveled to 1-D, which is
    what scipy's distance functions expect).
    """
    if is_sparse:
        a1 = [np.asarray(x.toarray()).ravel() for x in v1s]
        a2 = [np.asarray(y.toarray()).ravel() for y in v2s]
    else:
        a1 = list(v1s)
        a2 = list(v2s)

    def _col(fn):
        # One distance per pair, shaped (n, 1) for the final hstack.
        return np.array([fn(x, y) for x, y in zip(a1, a2)]).reshape((-1, 1))

    dcosine = _col(cosine)
    dcityblock = _col(cityblock)
    dcanberra = _col(canberra)
    deuclidean = _col(euclidean)
    dminkowski = _col(lambda x, y: minkowski(x, y, 3))
    dbraycurtis = _col(braycurtis)

    dskew_diff = np.abs(
        np.array([skew(x) for x in a1]) - np.array([skew(y) for y in a2])
    ).reshape((-1, 1))
    dkur_diff = np.abs(
        np.array([kurtosis(x) for x in a1]) - np.array([kurtosis(y) for y in a2])
    ).reshape((-1, 1))

    return np.hstack((dcosine, dcityblock, dcanberra, deuclidean,
                      dminkowski, dbraycurtis, dskew_diff, dkur_diff))
def calculateL2(self, feat1, feat2, c_type='euclidean'):
    """Return the distance between two equally-shaped feature vectors.

    :param feat1: first feature vector.
    :param feat2: second feature vector (same shape as ``feat1``).
    :param c_type: one of 'cosine', 'euclidean', 'correlation',
        'braycurtis', 'canberra', 'chebyshev'.
    :raises ValueError: for an unknown ``c_type`` (previously this fell
        through and raised ``UnboundLocalError`` on ``s_d``).
    """
    assert np.shape(feat1) == np.shape(feat2)
    # Feature length; kept for parity with the weighted-euclidean
    # variant that is commented out below (currently unused).
    if config.insight:
        [len_, ] = np.shape(feat1)
        # print(np.shape(feat1))
    else:
        _, len_ = np.shape(feat1)
        # print("len ", len_)
    if c_type == "cosine":
        s_d = distance.cosine(feat1, feat2)
    elif c_type == "euclidean":
        # s_d = np.sqrt(np.sum(np.square(feat1-feat2)))
        # s_d = distance.euclidean(feat1, feat2, w=1./len_)
        s_d = distance.euclidean(feat1, feat2, w=1)
    elif c_type == "correlation":
        s_d = distance.correlation(feat1, feat2)
    elif c_type == "braycurtis":
        s_d = distance.braycurtis(feat1, feat2)
    elif c_type == 'canberra':
        s_d = distance.canberra(feat1, feat2)
    elif c_type == "chebyshev":
        s_d = distance.chebyshev(feat1, feat2)
    else:
        raise ValueError("unknown distance type: %r" % (c_type,))
    return s_d
def get_braycurtis(
    input_file: pathlib.Path,
    db_file: pathlib.Path,
    db_name: str,
    level_str: str,
    threshold: int,
):
    """Collect Bray-Curtis dissimilarities between an input table and a
    reference database for each requested taxonomy level.

    Returns a list of record dicts with keys: database, sample,
    tax_level, braycurtis.
    """
    tax_levels = level_str.split(",")
    query_table = pd.read_csv(input_file)

    # Reference table: dense counts joined with observation metadata.
    reference = load_table(str(db_file))
    dense_df = reference.to_dataframe(dense=True)
    sample_ids = list(dense_df.columns)
    observation_meta = reference.metadata_to_dataframe(axis="observation")
    reference_table = pd.concat([observation_meta, dense_df], axis=1)

    records = []
    print(f"Calculating braycurtis dissimilarity for {db_name}")
    for tax_level in tax_levels:
        vector_iter = get_vectors(
            query_table, reference_table, tax_level, sample_ids, threshold)
        for u, v, otu_ids, col in vector_iter:
            records.append({
                "database": db_name,
                "sample": col,
                "tax_level": tax_level,
                "braycurtis": braycurtis(u, v),
            })
    return records
def step(self, action=None): """ Render HTML and return state, reward, done for each step :param action: :return: """ # print(self.idx) if action is None: action = self.action_sample() self.html_vec[self.idx] = action if self.html_vec[:3] == [2, 1, 3]: print('HTML vec: ', self.html_vec) html = self.html_covr.convert( self.html_vec, direction=HTML2VECConverter.VEC2HTML_DIRECTION) html = self.fill_text_for_html(html) state = self.renderer.render_html(html) / 255.0 dist = distance.braycurtis(self.result_image.flatten(), state.flatten()) reward = HTMLGame.REWARD if dist < 1e-6 else 0 if set([2, 1, 3]) < set(self.html_vec): reward = HTMLGame.REWARD / 2.0 if set([4, 1, 5]) < set(self.html_vec): reward = HTMLGame.REWARD / 2.0 # reward = HTMLGame.REWARD if self.html_vec == [2, 1, 3, 4, 1, 5] else 0 self.idx += 1 done = False if reward == HTMLGame.REWARD: done = True return state, np.array([ np.identity(6)[v:v + 1] for v in self.html_vec ]).flatten(), reward, done
def Dist(array1, array2, dist):
    """Dispatch to the named distance or correlation measure.

    'pearsonr'/'spearmanr' return the coefficient, 'pearsonp'/'spearmanp'
    the p-value; the remaining names map to scipy.spatial.distance
    functions. An unrecognised name returns None.
    """
    if dist == 'pearsonp':
        return pearsonr(array1, array2)[1]
    if dist == 'pearsonr':
        return pearsonr(array1, array2)[0]
    if dist == 'spearmanp':
        return spearmanr(array1, array2)[1]
    if dist == 'spearmanr':
        return spearmanr(array1, array2)[0]
    metric_table = {
        'braycurtis': distance.braycurtis,
        'correlation': distance.correlation,
        'mahalanobis': distance.mahalanobis,
        'minkowski': distance.minkowski,
        'seuclidean': distance.seuclidean,
        'sqeuclidean': distance.sqeuclidean,
    }
    metric = metric_table.get(dist)
    if metric is not None:
        return metric(array1, array2)
    # Unrecognised measure: implicit None, as before.
    return None
def calculate_featureset4(dataframe, q1_vectors, q2_vectors):
    """Append pairwise-distance and moment feature columns to ``dataframe``.

    :param dataframe: pandas DataFrame mutated in place (also returned).
    :param q1_vectors, q2_vectors: equal-length sequences of vectors.

    Defect fixed: ``np.nan_to_num`` over the full vector sets was
    recomputed for every one of the eleven feature columns; it is now
    applied once and reused.
    """
    q1 = np.nan_to_num(q1_vectors)
    q2 = np.nan_to_num(q2_vectors)
    pairs = list(zip(q1, q2))
    dataframe['cosine_dist'] = [cosine(x, y) for x, y in pairs]
    dataframe['cityblock_dist'] = [cityblock(x, y) for x, y in pairs]
    dataframe['jaccard_dist'] = [jaccard(x, y) for x, y in pairs]
    dataframe['canberra_dist'] = [canberra(x, y) for x, y in pairs]
    dataframe['euclidean_dist'] = [euclidean(x, y) for x, y in pairs]
    dataframe['minkowski_dist'] = [minkowski(x, y, 3) for x, y in pairs]
    dataframe['braycurtis_dist'] = [braycurtis(x, y) for x, y in pairs]
    dataframe['skew_q1'] = [skew(x) for x in q1]
    dataframe['skew_q2'] = [skew(x) for x in q2]
    dataframe['kurtosis_q1'] = [kurtosis(x) for x in q1]
    dataframe['kurtosis_q2'] = [kurtosis(x) for x in q2]
    return dataframe
def compare_locations(c1, c2, method='Average'):
    """Bray-Curtis dissimilarity between two wifi fingerprints.

    Compares RSSI values of the access points the two locations share.
    'First' uses the first RSSI reading per AP; 'Average' uses the mean.
    Returns 1 when there are no (or too few) common APs.
    """
    wifi1 = c1['fingerprints']['wifi']
    wifi2 = c2['fingerprints']['wifi']
    shared_aps = list(set(wifi1.keys()) & set(wifi2.keys()))

    # No APs in common -> similarity = 1
    if not shared_aps:
        return 1
    # TODO: find the best metric
    # If not enough common APs -> similarity = 1
    if len(shared_aps) * 10 < len(wifi1.keys()):
        return 1

    rssi1 = []
    rssi2 = []
    for ap in shared_aps:
        if method == 'First':
            # Take only the first RSSI value
            rssi1.append(wifi1[ap]['rssi'][0])
            rssi2.append(wifi2[ap]['rssi'][0])
        if method == 'Average':
            # Make an average of all RSSI values
            rssi1.append(np.average(wifi1[ap]['rssi']))
            rssi2.append(np.average(wifi2[ap]['rssi']))

    return braycurtis(tuple(rssi1), tuple(rssi2))
def braycurtis(x, y): try: return distance.braycurtis(x, y) except ValueError: return np.NaN except: return np.NaN
def extend_with_features(data):
    """Extend a question-pair DataFrame with fuzzy-match, Word Mover's
    Distance and word2vec vector-distance features.

    Adds columns: fuzz_qratio, fuzz_WRatio, wmd, norm_wmd, seven vector
    distances, and per-question skew/kurtosis. Returns the same frame.
    """
    # NOTE(review): stop_words is never used in this block — presumably
    # consumed by sent2vec/wmd helpers through globals; confirm.
    stop_words = stopwords.words('english')
    # Character-level fuzzy similarity of the raw question strings.
    data['fuzz_qratio'] = data.apply(
        lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1)
    data['fuzz_WRatio'] = data.apply(
        lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1)
    # WMD on the raw model...
    model = gensim.models.KeyedVectors.load_word2vec_format(
        google_news_model_path, binary=True)
    data['wmd'] = data.apply(
        lambda x: wmd(model, x['question1'], x['question2']), axis=1)
    # ...and on a second, L2-normalised copy. Two loads are required
    # because init_sims(replace=True) mutates the embeddings in place.
    norm_model = gensim.models.KeyedVectors.load_word2vec_format(
        google_news_model_path, binary=True)
    norm_model.init_sims(replace=True)
    data['norm_wmd'] = data.apply(
        lambda x: norm_wmd(norm_model, x['question1'], x['question2']), axis=1)
    # 300-dimensional sentence embeddings for every question.
    question1_vectors = np.zeros((data.shape[0], 300))
    for i, q in enumerate(data.question1.values):
        question1_vectors[i, :] = sent2vec(model, q)
    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in enumerate(data.question2.values):
        question2_vectors[i, :] = sent2vec(model, q)
    # Replace NaNs from failed embeddings before computing distances.
    question1_vectors = np.nan_to_num(question1_vectors)
    question2_vectors = np.nan_to_num(question2_vectors)
    data['cosine_distance'] = [
        cosine(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['cityblock_distance'] = [
        cityblock(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['jaccard_distance'] = [
        jaccard(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['canberra_distance'] = [
        canberra(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['euclidean_distance'] = [
        euclidean(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['minkowski_distance'] = [
        minkowski(x, y, 3) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    data['braycurtis_distance'] = [
        braycurtis(x, y) for (x, y) in zip(question1_vectors, question2_vectors)
    ]
    # Per-question distribution-shape features.
    data['skew_q1vec'] = [skew(x) for x in question1_vectors]
    data['skew_q2vec'] = [skew(x) for x in question2_vectors]
    data['kur_q1vec'] = [kurtosis(x) for x in question1_vectors]
    data['kur_q2vec'] = [kurtosis(x) for x in question2_vectors]
    return data
def feature3(data):
    """Add word2vec sentence-embedding distance features to ``data``.

    :param data: DataFrame with ``question1``/``question2`` columns;
        mutated in place and returned.

    Defect fixed: ``np.nan_to_num`` over the full (n, 300) matrices was
    recomputed for every one of the eleven feature columns; it is now
    applied once and reused.
    """
    question1_vectors = np.zeros((data.shape[0], 300))
    error_count = 0  # kept for parity with the original (never updated)
    for i, q in tqdm(enumerate(data.question1.values)):
        question1_vectors[i, :] = sent2vec(q)
    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question2.values)):
        question2_vectors[i, :] = sent2vec(q)
    q1 = np.nan_to_num(question1_vectors)
    q2 = np.nan_to_num(question2_vectors)
    data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(q1, q2)]
    data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(q1, q2)]
    data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(q1, q2)]
    data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(q1, q2)]
    data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(q1, q2)]
    data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(q1, q2)]
    data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(q1, q2)]
    data['skew_q1vec'] = [skew(x) for x in q1]
    data['skew_q2vec'] = [skew(x) for x in q2]
    data['kur_q1vec'] = [kurtosis(x) for x in q1]
    data['kur_q2vec'] = [kurtosis(x) for x in q2]
    return data
def calculate_distance(self, method, histA, histB):
    """Compare two histograms with the named method.

    'braycurtis' and 'intersection' are handled explicitly; any other
    value falls back to the chi-squared distance (use chi by default).
    """
    if method == "braycurtis":
        return braycurtis(histA, histB)
    if method == "intersection":
        return self.intersection(histA, histB)
    return self.chi2_distance(histA, histB)
def vectors_features(in_data: pd.DataFrame, sent2vec: Callable[[str], np.array]) -> pd.DataFrame:
    """Attach sentence-vector distance and moment columns to ``in_data``.

    ``sent2vec`` maps each question string to a fixed-length vector; the
    frame gains seven distance columns, per-question skew/kurtosis, and
    their absolute differences. The frame is mutated and returned.
    """
    assert "question1" in in_data.columns
    assert "question2" in in_data.columns
    vectors1 = np.array([sent2vec(text) for text in in_data['question1']])
    vectors2 = np.array([sent2vec(text) for text in in_data['question2']])

    # Column order matches the original implementation.
    metric_columns = (
        ('cos', cosine),
        ('jaccard', jaccard),
        ('euclidean', euclidean),
        ('minkowski', minkowski),
        ('cityblock', cityblock),
        ('canberra', canberra),
        ('braycurtis', braycurtis),
    )
    for column, metric in metric_columns:
        in_data[column] = np.array(
            [metric(a, b) for a, b in zip(vectors1, vectors2)])

    in_data['skew_q1'] = np.array([skew(v) for v in vectors1])
    in_data['skew_q2'] = np.array([skew(v) for v in vectors2])
    in_data['kur_q1'] = np.array([kurtosis(v) for v in vectors1])
    in_data['kur_q2'] = np.array([kurtosis(v) for v in vectors2])
    in_data['skew_diff'] = np.abs(in_data['skew_q1'] - in_data['skew_q2'])
    in_data['kur_diff'] = np.abs(in_data['kur_q1'] - in_data['kur_q2'])
    return in_data
def distance_features(data, genismModel):
    """Add word2vec distance/moment columns to a question-pair frame.

    Returns the mutated frame and the list of added column names.
    """
    w2v_q1 = np.array([sent2vec(q, genismModel) for q in data.question1])
    w2v_q2 = np.array([sent2vec(q, genismModel) for q in data.question2])

    # Scalar placeholders mark failed embeddings; replace with 300 zeros.
    zero_vec = np.zeros(300)
    for idx in range(len(w2v_q1)):
        if w2v_q1[idx].size == 1:
            w2v_q1[idx] = zero_vec
    for idx in range(len(w2v_q2)):
        if w2v_q2[idx].size == 1:
            w2v_q2[idx] = zero_vec

    paired = list(zip(w2v_q1, w2v_q2))
    data['cosine_distance'] = [cosine(x, y) for x, y in paired]
    data['cityblock_distance'] = [cityblock(x, y) for x, y in paired]
    data['jaccard_distance'] = [jaccard(x, y) for x, y in paired]
    data['canberra_distance'] = [canberra(x, y) for x, y in paired]
    data['euclidean_distance'] = [euclidean(x, y) for x, y in paired]
    data['minkowski_distance'] = [minkowski(x, y, 3) for x, y in paired]
    data['braycurtis_distance'] = [braycurtis(x, y) for x, y in paired]
    data['skew_q1vec'] = [skew(x) for x in w2v_q1]
    data['skew_q2vec'] = [skew(x) for x in w2v_q2]
    data['kur_q1vec'] = [kurtosis(x) for x in w2v_q1]
    data['kur_q2vec'] = [kurtosis(x) for x in w2v_q2]

    fs_4 = ['cosine_distance', 'cityblock_distance', 'jaccard_distance',
            'canberra_distance', 'euclidean_distance', 'minkowski_distance',
            'braycurtis_distance', 'skew_q1vec', 'skew_q2vec',
            'kur_q1vec', 'kur_q2vec']
    return data, fs_4
def get_w2v_simi(query, title):
    """Return a ':'-joined string of word2vec similarity features
    between a query and a title (seven distances plus skew/kurtosis of
    each embedding)."""
    q_vec = np.nan_to_num(sent2vec(query))
    t_vec = np.nan_to_num(sent2vec(title))
    feature_values = [
        cosine(q_vec, t_vec),
        cityblock(q_vec, t_vec),
        jaccard(q_vec, t_vec),
        canberra(q_vec, t_vec),
        euclidean(q_vec, t_vec),
        minkowski(q_vec, t_vec),
        braycurtis(q_vec, t_vec),
        skew(q_vec),
        skew(t_vec),
        kurtosis(q_vec),
        kurtosis(t_vec),
    ]
    # Same output as ':'.join(['{}'] * n).format(*values).
    return ':'.join('{}'.format(value) for value in feature_values)
def calculate_distance(X, Y, metric='euclidean'):
    """Return the distance between vectors X and Y under the named metric.

    :param metric: one of the METRIC_* constants.
    :raises ValueError: for an unrecognised metric (previously the
        function silently returned None).
    """
    if metric == METRIC_EUCLIDEAN:
        return distance.euclidean(X, Y)
    elif metric == METRIC_JACCARD:
        return distance.jaccard(X, Y)
    elif metric == METRIC_CANBERRA:
        return distance.canberra(X, Y)
    elif metric == METRIC_CHEBYSHEV:
        return distance.chebyshev(X, Y)
    elif metric == METRIC_MINKOWSKI:
        return distance.minkowski(X, Y)
    elif metric == METRIC_WMINKOWSKI:
        # NOTE(review): scipy removed wminkowski in 1.8, and it required
        # a weight argument anyway — this branch cannot succeed on
        # modern scipy; confirm whether it is still reachable.
        return distance.wminkowski(X, Y)
    elif metric == METRIC_BRAYCURTIS:
        return distance.braycurtis(X, Y)
    elif metric == METRIC_HAMMING:
        return distance.hamming(X, Y)
    elif metric == METRIC_MAHALANOBIS:
        # NOTE(review): scipy's mahalanobis also needs the inverse
        # covariance matrix; called with two args this raises TypeError.
        return distance.mahalanobis(X, Y)
    elif metric == METRIC_MANHATTAN:
        return sum(abs(a - b) for a, b in zip(X, Y))
    elif metric == METRIC_COSINE:
        # Cosine *similarity* (not distance), computed directly.
        dot_product = np.dot(X, Y)
        norm_a = np.linalg.norm(X)
        norm_b = np.linalg.norm(Y)
        return dot_product / (norm_a * norm_b)
    raise ValueError("unsupported metric: %r" % (metric,))
def _compute_per_level_accuracy(exp, obs, metadata, depth):
    """Per-sample accuracy statistics for observed vs. expected taxonomy
    tables at every collapse level from 1 to ``depth``.

    Returns:
        results: DataFrame with one row per (sample, level) holding the
            observed-taxon count/ratio, TAR/TDR, linear-regression
            statistics, Bray-Curtis dissimilarity and Jaccard distance.
        vectors: dict mapping level -> {'exp': [...], 'obs': [...]}
            with the concatenated abundance vectors (for regplots).
    """
    results = []
    vectors = {}
    for level in range(1, depth + 1):
        vectors[level] = {'exp': [], 'obs': []}
        # collapse taxonomy strings to level
        exp_collapsed = _collapse_table(exp, level)
        obs_collapsed = _collapse_table(obs, level)
        # compute stats for each sample individually
        for sample in obs_collapsed.index:
            result = [sample, level]
            # if metadata are passed, map exp sample ID to value in metadata
            if metadata is not None:
                exp_id = metadata[sample]
            else:
                exp_id = sample
            # concatenate obs/exp observations to align features; missing
            # features in either table become zero abundance
            joined_table = pd.concat(
                [exp_collapsed.loc[exp_id], obs_collapsed.loc[sample]],
                axis=1, sort=True).fillna(0)
            # split joined table apart again for computing stats
            exp_vector = joined_table.iloc[:, 0]
            obs_vector = joined_table.iloc[:, 1]
            # nonzero entries = features actually present on each side
            exp_features = exp_vector[exp_vector != 0]
            obs_features = obs_vector[obs_vector != 0]
            # Count observed taxa
            observed_feature_count = len(obs_features)
            observed_feature_ratio = (observed_feature_count /
                                      len(exp_features))
            result.extend([observed_feature_count, observed_feature_ratio])
            # compute TAR/TDR
            result.extend(compute_taxon_accuracy(exp_features, obs_features))
            # compute linear least-squares regression results
            if len(exp_vector) == len(obs_vector) == 1:
                # linear regression cannot compute if vector length < 2
                reg_results = [np.nan] * 5
            else:
                reg_results = linregress(exp_vector, obs_vector)
            result.extend(reg_results)
            # compute Bray-Curtis dissimilarity
            result.append(braycurtis(exp_vector, obs_vector))
            # compute Jaccard distance, must convert to bool array
            result.append(
                jaccard(list(map(bool, exp_vector)),
                        list(map(bool, obs_vector))))
            results.append(result)
            # store vectors for constructing regplots
            vectors[level]['exp'].extend(exp_vector)
            vectors[level]['obs'].extend(obs_vector)
    results = pd.DataFrame(results, columns=[
        'sample', 'level', 'Observed Taxa', 'Observed / Expected Taxa',
        'TAR', 'TDR', 'Slope', 'Intercept', 'r-value', 'P value', 'Std Err',
        'Bray-Curtis', 'Jaccard'
    ])
    results['r-squared'] = results['r-value']**2
    return results, vectors
def features_similarity(cls, df):
    """Populate ``cls.dict_features`` with eleven embedding-distance and
    moment features for each question pair in ``df``, then return the
    raw embedding matrices.

    Defect fixed: ``np.nan_to_num`` over both full embedding matrices
    was recomputed for every feature; it is now applied once and reused
    (the returned matrices remain the raw, unconverted vectors, as
    before).
    """
    cls.load_model(normed=True)
    question1_vectors, question2_vectors = cls.get_questions_vector(df)
    cls.resetmodel()
    q1 = np.nan_to_num(question1_vectors)
    q2 = np.nan_to_num(question2_vectors)
    cls.dict_features['cosine_distance'] = [
        cosine(x, y) for (x, y) in zip(q1, q2)
    ]
    print("1/11 Cosine Distance finished.")
    cls.dict_features['cityblock_distance'] = [
        cityblock(x, y) for (x, y) in zip(q1, q2)
    ]
    print("2/11 Cityblock Distance finished.")
    cls.dict_features['jaccard_distance'] = [
        jaccard(x, y) for (x, y) in zip(q1, q2)
    ]
    print("3/11 Jaccard Distance finished.")
    cls.dict_features['canberra_distance'] = [
        canberra(x, y) for (x, y) in zip(q1, q2)
    ]
    print("4/11 Canberra Distance finished.")
    cls.dict_features['euclidean_distance'] = [
        euclidean(x, y) for (x, y) in zip(q1, q2)
    ]
    print("5/11 Euclidean Distance finished.")
    cls.dict_features['minkowski_distance'] = [
        minkowski(x, y, 3) for (x, y) in zip(q1, q2)
    ]
    print("6/11 Minkowski Distance finished.")
    cls.dict_features['braycurtis_distance'] = [
        braycurtis(x, y) for (x, y) in zip(q1, q2)
    ]
    print("7/11 Braycurtis Distance finished.")
    cls.dict_features['skew_q1vec'] = [skew(x) for x in q1]
    print("8/11 Skew Q1 Vec finished.")
    cls.dict_features['skew_q2vec'] = [skew(x) for x in q2]
    print("9/11 Skew Q2 Vec finished.")
    cls.dict_features['kur_q1vec'] = [kurtosis(x) for x in q1]
    print("10/11 Kurtosis Q1 Vec finished.")
    cls.dict_features['kur_q2vec'] = [kurtosis(x) for x in q2]
    print("11/11 Kurtosis Q2 Vec finished.")
    return question1_vectors, question2_vectors
def compare_features(feature_array_1, feature_array_2, key): br = braycurtis(feature_array_1, feature_array_2) #concatenate features of two images and reshape to two dimensional matrix features = np.hstack((feature_array_1, feature_array_2)) br_features = np.hstack((features, br)) proba = xgboost_model.predict_proba(br_features).tolist()[0] return {'name': key, 'different': proba[0], 'same': proba[1]}
def distance(self, u, v, distancemetric):
    """Return the distance between vectors u and v under the named metric.

    Defects fixed: the metric name was compared with ``is`` (identity)
    instead of ``==`` (equality) — string identity only works by the
    accident of CPython interning and is a SyntaxWarning on 3.8+; an
    unknown metric now raises instead of silently returning None.
    """
    if distancemetric == "cosine":
        return distance.cosine(u, v)
    elif distancemetric == "euclidean":
        return distance.euclidean(u, v)
    elif distancemetric == "cityblock":
        return distance.cityblock(u, v)
    elif distancemetric == "braycurtis":
        return distance.braycurtis(u, v)
    raise ValueError("unknown distance metric: %r" % (distancemetric,))
def feats_tfidf(row):
    """Distance features between the LSA vectors of a question pair.

    Projects each question through the global dictionary/LSI model,
    truncates both vectors to the shared dimensionality, and returns a
    list of seven distances plus skew/kurtosis of each vector.
    """
    def lsa_vector(text):
        # Project the bag-of-words through the global LSI model.
        bow = dictionary.doc2bow(text.lower().split())
        return [value for (_index, value) in lsi[bow]]

    que1_vec = lsa_vector(str(row['question1']))
    que2_vec = lsa_vector(str(row['question2']))

    # drop some dimensions if they don't match
    shared_len = min(len(que1_vec), len(que2_vec))
    que1_vec = que1_vec[:shared_len]
    que2_vec = que2_vec[:shared_len]

    # Calculate distances between lsa vectors; keep the original
    # best-effort fallbacks for degenerate vectors.
    try:
        lsa_cosine = cosine(que1_vec, que2_vec)
    except:
        lsa_cosine = 1
    lsa_cityblock = cityblock(que1_vec, que2_vec)
    lsa_jaccard = jaccard(que1_vec, que2_vec)
    lsa_canberra = canberra(que1_vec, que2_vec)
    try:
        lsa_euclidean = euclidean(que1_vec, que2_vec)
    except:
        lsa_euclidean = np.nan
    lsa_minkowski = minkowski(que1_vec, que2_vec, 3)
    lsa_braycurtis = braycurtis(que1_vec, que2_vec)

    return [lsa_cosine, lsa_cityblock, lsa_jaccard, lsa_canberra,
            lsa_euclidean, lsa_minkowski, lsa_braycurtis,
            skew(que1_vec), kurtosis(que1_vec),
            skew(que2_vec), kurtosis(que2_vec)]
def dist_features(data):
    """Compute word2vec distance features and cache the raw embeddings.

    Loads the GoogleNews word2vec model, embeds both questions, adds
    eleven distance/moment columns to ``data`` and pickles the raw
    embedding matrices to disk. Returns ``data``.

    Defect fixed: ``np.nan_to_num`` over the full (n, 300) matrices was
    recomputed for every feature column; it is now applied once (the
    pickled matrices remain the raw vectors, as before).
    """
    model = gensim.models.KeyedVectors.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin.gz', binary=True)
    question1_vectors = np.zeros((data.shape[0], 300))
    error_count = 0  # kept for parity with the original (never updated)
    for i, q in tqdm(enumerate(data.question1.values)):
        question1_vectors[i, :] = sent2vec(q, model)
    question2_vectors = np.zeros((data.shape[0], 300))
    for i, q in tqdm(enumerate(data.question2.values)):
        question2_vectors[i, :] = sent2vec(q, model)
    q1 = np.nan_to_num(question1_vectors)
    q2 = np.nan_to_num(question2_vectors)
    data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(q1, q2)]
    data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(q1, q2)]
    data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(q1, q2)]
    data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(q1, q2)]
    data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(q1, q2)]
    data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(q1, q2)]
    data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(q1, q2)]
    data['skew_q1vec'] = [skew(x) for x in q1]
    data['skew_q2vec'] = [skew(x) for x in q2]
    data['kur_q1vec'] = [kurtosis(x) for x in q1]
    data['kur_q2vec'] = [kurtosis(x) for x in q2]
    # Persist the raw (pre-nan_to_num) embeddings, as before.
    cPickle.dump(question1_vectors, open('data/q1_w2v.pkl', 'wb'), -1)
    cPickle.dump(question2_vectors, open('data/q2_w2v.pkl', 'wb'), -1)
    return data
def identifiability(sub_list, ses_list, gv_array, measure, ses1, ses2): ''' This function calculates the identifiability of subjects as I_diff=I_self-I_others where I_self is similarity between the same subject in two different sessions averaged over all subjects and I_others is similarity between a given subject and all the others in two different sessions averaged over all subjects. Input: sub_list - vector of subjects, ses_list - vector with session numbers, gv_array - array of shape (number of subjects * number of sessions) x (number of graph measures) measure - 'cosine' - cosine similarity, 'pearsonr' - Pearson correlation coefficient ses1, ses2 - numbers of sessions to compare (integers) Output: I_diff - identifiability (scalar). ''' ###--- Import packages from scipy.stats.stats import pearsonr from scipy.spatial.distance import cityblock, euclidean, minkowski, braycurtis ###--- Define cosine similarity between two vectors def dot(A,B): return (sum(a*b for a,b in zip(A,B))) def cosine_similarity(a,b): return dot(a,b) / ((dot(a,a)**.5) * (dot(b,b)**.5)) ###--- Find number of subjects and number of sessions N_ses = int(max(ses_list)) N_sub = (len(sub_list)) ###--- Calculate identifiability matrix I_mat = np.zeros((N_sub,N_sub)) if measure == 'euclidean': for sub1 in range(N_sub): for sub2 in range(N_sub): I_mat[int(sub1)-1,int(sub2)-1] = euclidean(gv_array[int(sub1)*N_ses+ses1-3,:],gv_array[int(sub2)*N_ses+ses2-3,:]) elif measure == 'cityblock': for sub1 in range(N_sub): for sub2 in range(N_sub): I_mat[int(sub1)-1,int(sub2)-1] = cityblock(gv_array[int(sub1)*N_ses+ses1-3,:],gv_array[int(sub2)*N_ses+ses2-3,:]) elif measure == 'braycurtis': for sub1 in range(N_sub): for sub2 in range(N_sub): I_mat[int(sub1)-1,int(sub2)-1] = braycurtis(gv_array[int(sub1)*N_ses+ses1-3,:],gv_array[int(sub2)*N_ses+ses2-3,:]) ###--- Create an out-of-diagonal elements mask out = np.ones((len(sub_complete),len(sub_complete)),dtype=bool) np.fill_diagonal(out,0) ###---Similarity of subject 
to others, averaged over all subjects I_others=np.mean(I_mat[out]) ###---Similarity of subject to himself, averaged over all subjects I_self = np.mean(np.diagonal(I_mat)) I_diff=I_self/I_others return I_diff
def compute_sim_bray(df, currentVec, simArray): for index, row in df.iterrows(): imageName = row[0] + '.jpg' brayDis = distance.braycurtis(currentVec, row[1:]) temp = pd.DataFrame([[imageName, brayDis]], columns=['ImageName', 'BrayCurtis']) simArray = simArray.append(temp, ignore_index=True) return simArray
def get_distance_features(data, emb):
    """Attach seven vector-distance columns to ``data``.

    ``emb`` is a sequence of (x, y) vector pairs; it is iterated once
    per metric, so it must be re-iterable (a list, not a generator).
    """
    metric_table = (
        ('cosine_distance', cosine),
        ('cityblock_distance', cityblock),
        ('jaccard_distance', jaccard),
        ('canberra_distance', canberra),
        ('euclidean_distance', euclidean),
        ('minkowski_distance', lambda x, y: minkowski(x, y, 3)),
        ('braycurtis_distance', braycurtis),
    )
    for column, metric in metric_table:
        data[column] = pd.Series([metric(x, y) for x, y in emb])
    return data
def test_compare_braycurtis_definitions():
    """Print scipy's braycurtis next to the local BrayCurtis helper on
    one random input pair, plus the difference between the two."""
    x = np.random.uniform(0, 10, 10)
    y = np.random.uniform(0, 10, 10)
    scipy_bc = braycurtis(x, y)
    local_bc = BrayCurtis(x, y)
    print("The Bray-Curtis distance of the scipy.spatial.distance package is:", scipy_bc)
    print("The Bray-Curtis distance ( sum(abs(x-y)) / sum(x+y) ) is:", local_bc)
    print("The difference between both definitions is ", scipy_bc - local_bc)
def build_features(self, net, photo):
    """Build the classifier input matrix pairing one photo's features
    with every stored client feature row; result lands in
    ``self.cls_arr_br``.
    """
    size_arr = self.clients_features.shape[0]
    self.photo_features = net.get_features(photo)
    # Tile the photo's feature vector so it can be paired with every
    # client row.
    self.main_arr_broadcasted = broadcast_array(self.photo_features, size_arr)
    cls_arr = np.hstack((self.clients_features, self.main_arr_broadcasted))
    # NOTE(review): scipy's braycurtis documents 1-D inputs; this call
    # passes the full 2-D matrices and reshapes to (1, size_arr) —
    # confirm the arrays' shapes upstream make this valid.
    braycurtis_dist = braycurtis(self.clients_features, self.main_arr_broadcasted).reshape(1, size_arr)
    # Final matrix: [client features | photo features | braycurtis col].
    self.cls_arr_br = np.concatenate((cls_arr, braycurtis_dist.T), axis=1)
def bray_curtis_dist(user_predict, adoptable_dogs, images):
    '''
    Calculating Bray-Curtis distance between two 1D arrays and return
    similarity score
    '''
    flat_user = user_predict.flatten()
    sim_score = [
        distance.braycurtis(flat_user, dog.flatten())
        for dog in adoptable_dogs
    ]
    print('Maximum SimScore: ' + str(max(sim_score)))
    return pd.DataFrame({'imgFile': images, 'SimScore': sim_score})
def feature_construct(city, model_name, friends, walk_len=100, walk_times=20, num_features=128):
    '''Construct the pairwise user-feature file from a stored embedding.

    For every candidate user pair, computes eight vector distances
    between the users' embedding vectors and appends
    [u1, u2, label, distances...] rows to
    dataset/<city>/feature/<...>.feature (CSV, append mode).

    Args:
        city: city name used to locate the dataset files.
        model_name: embedding model identifier, e.g. '20_locid'.
        friends: friends list (asymmetric) [u1, u2].
        walk_len: walk length (filename component).
        walk_times: walk times (filename component).
        num_features: embedding dimensionality (filename component).

    Returns:
        None; output is written to disk.
    '''
    # Remove any stale output so the append-mode writes below start clean.
    if os.path.exists('dataset/'+city+'/feature/'+city+'_'+model_name+'_'+\
        str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.feature'):
        os.remove('dataset/'+city+'/feature/'+city+'_'+model_name+'_'+\
            str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.feature')
    # Embedding file: whitespace-separated; the first row is a header to skip.
    emb = pd.read_csv('dataset/'+city+'/emb/'+city+'_'+model_name+'_'+\
        str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.emb',\
        header=None, skiprows=1, sep=' ')
    emb = emb.rename(columns={0: 'uid'})  # column 0 holds the node (user) id
    emb = emb.loc[emb.uid > 0]  # only take users, no loc_type, not necessary
    pair = pair_construct(emb.uid.unique(), friends)
    for i in range(len(pair)):
        u1 = pair.loc[i, 'u1']
        u2 = pair.loc[i, 'u2']
        label = pair.loc[i, 'label']
        # Embedding vectors are all columns except the uid column.
        u1_vector = emb.loc[emb.uid == u1, range(1, emb.shape[1])]
        u2_vector = emb.loc[emb.uid == u2, range(1, emb.shape[1])]
        # One feature row: ids, label, then eight vector distances.
        i_feature = pd.DataFrame([[
            u1, u2, label,
            cosine(u1_vector, u2_vector),
            euclidean(u1_vector, u2_vector),
            correlation(u1_vector, u2_vector),
            chebyshev(u1_vector, u2_vector),
            braycurtis(u1_vector, u2_vector),
            canberra(u1_vector, u2_vector),
            cityblock(u1_vector, u2_vector),
            sqeuclidean(u1_vector, u2_vector)
        ]])
        i_feature.to_csv('dataset/'+city+'/feature/'+city+'_'+model_name+'_'+\
            str(int(walk_len))+'_'+str(int(walk_times))+'_'+str(int(num_features))+'.feature',\
            index = False, header = None, mode = 'a')
def make_prediction(self, img1, img2):
    """Classify whether two images depict the same subject.

    :param img1: first image (passed straight to ``self.get_features``).
    :param img2: dict with 'base64' (image payload) and 'name' keys.
    :return: {'name': ..., 'different': p0, 'same': p1}.
    """
    image1_features = self.get_features(img1)
    image2_features = self.get_features(img2['base64'])
    # Bray-Curtis distance between the two feature vectors is appended
    # as one extra classifier input.
    br = braycurtis(image1_features, image2_features)
    #concatenate features of two images and reshape to two dimensional matrix
    features = np.hstack((image1_features, image2_features))
    # NOTE(review): 805 = 2 * 402 + 1, implying each feature vector is
    # 402-dimensional — TODO confirm against get_features.
    br_features = np.hstack((features, br)).reshape(1, 805)
    proba = self.xgboost_model.predict_proba(br_features).tolist()[0]
    return {'name': img2['name'], 'different': proba[0], 'same': proba[1]}
def run_on(self, df_run):
    """Populate the ``fldprefix_1``..``_13`` WMD/distance/moment columns
    of ``df_run`` from the sentence vectors of its two text columns.

    Mapping dictionaries come from the global ``dicts`` cache or, when
    absent, from ``dict_<col>.csv`` files under the global ``workdir``.

    Defects fixed: ``DataFrame.set_value`` was removed in pandas 1.0 —
    cell writes now use the ``.at`` indexer; ``type(x) == str`` checks
    became ``isinstance``.
    """
    if self.col1 not in dicts:
        self.dict1 = self.pd.read_csv(
            workdir + 'dict_' + self.col1 + '.csv',
            dtype={'value': object}).set_index('key')["value"].to_dict()
    else:
        self.dict1 = {v: k for k, v in dicts[self.col1].items()}  # make key=number, value=string
    if self.col2 not in dicts:
        self.dict2 = self.pd.read_csv(
            workdir + 'dict_' + self.col2 + '.csv',
            dtype={'value': object}).set_index('key')["value"].to_dict()
    else:
        self.dict2 = {v: k for k, v in dicts[self.col2].items()}  # make key=number, value=string

    # Decode the numeric columns back into strings.
    self.dfx = self.pd.DataFrame()
    self.dfx[self.col1] = df_run[self.col1].map(self.dict1)
    self.dfx[self.col2] = df_run[self.col2].map(self.dict2)

    block = int(len(df_run) / 50)
    i = 0
    for index, row in self.dfx.iterrows():
        i += 1
        sline1 = self.func(row[self.col1]) if isinstance(row[self.col1], str) else ''
        sline2 = self.func(row[self.col2]) if isinstance(row[self.col2], str) else ''
        wta = word_tokenize(sline1.lower())
        wtb = word_tokenize(sline2.lower())
        s2v_a = self.sent2vec(wta)
        s2v_b = self.sent2vec(wtb)
        # One column per WMD/distance/moment feature.
        df_run.at[index, self.fldprefix + '_1'] = self.wmd(sline1, sline2)
        df_run.at[index, self.fldprefix + '_2'] = self.norm_wmd(sline1, sline2)
        df_run.at[index, self.fldprefix + '_3'] = cosine(s2v_a, s2v_b)
        df_run.at[index, self.fldprefix + '_4'] = cityblock(s2v_a, s2v_b)
        df_run.at[index, self.fldprefix + '_5'] = jaccard(s2v_a, s2v_b)
        df_run.at[index, self.fldprefix + '_6'] = canberra(s2v_a, s2v_b)
        df_run.at[index, self.fldprefix + '_7'] = euclidean(s2v_a, s2v_b)
        df_run.at[index, self.fldprefix + '_8'] = minkowski(s2v_a, s2v_b, 3)
        df_run.at[index, self.fldprefix + '_9'] = braycurtis(s2v_a, s2v_b)
        df_run.at[index, self.fldprefix + '_10'] = skew(s2v_a)
        df_run.at[index, self.fldprefix + '_11'] = skew(s2v_b)
        df_run.at[index, self.fldprefix + '_12'] = kurtosis(s2v_a)
        df_run.at[index, self.fldprefix + '_13'] = kurtosis(s2v_b)
        # Periodic progress print for large frames.
        if i >= block and block >= 1000:
            i = 0
            print(index)
    # Distance-like columns default to 1.0 where undefined.
    cols = [self.fldprefix + '_3', self.fldprefix + '_5', self.fldprefix + '_9']
    df_run[cols] = df_run[cols].fillna(value=1.0)
def compute_distance(net, img1, img2):
    """Compare two images through their `net` embeddings.

    Each image is scored with ``get_scores``, averaged over axis 0 and
    L2-normalized; three distances between the normalized embeddings are
    returned as ``(braycurtis, cosine, euclidean)``.
    """
    id1 = get_scores(net, img1)
    id1 = np.mean(id1, axis=0)
    id1_norm = id1 / np.linalg.norm(id1)
    id2 = get_scores(net, img2)
    id2 = np.mean(id2, axis=0)
    id2_norm = id2 / np.linalg.norm(id2)
    comp_dist = ssd.braycurtis(id1_norm, id2_norm)
    # Python 2 `print comp_dist` is a SyntaxError under Python 3; the
    # function-call form behaves identically on both versions.
    print(comp_dist)
    dist_eucl = ssd.euclidean(id1_norm, id2_norm)
    dist_cosine = ssd.cosine(id1_norm, id2_norm)
    return comp_dist, dist_cosine, dist_eucl
def score_braycurtis(self, term1, term2, **kwargs):
    """
    Compute a weighting score based on the Bray-Curtis distance between
    the kernel density estimates of two terms.  The distance is flipped
    (1 - d) so that more similar distributions score higher.

    (The previous docstring said "City Block", but the code computes
    Bray-Curtis.)

    :param term1: The first term.
    :param term2: The second term.
    """
    t1_kde = self.kde(term1, **kwargs)
    t2_kde = self.kde(term2, **kwargs)
    return 1-distance.braycurtis(t1_kde, t2_kde)
def BrayCurtis(X):
    '''
    Compute the pairwise Bray-Curtis dissimilarity matrix.

    Args:
        X: input N x K data matrix. N ... the number of samples,
           K ... the number of features.
    Return:
        N x N symmetric matrix. The value at (i, j) is the Bray-Curtis
        distance between sample-i and sample-j (zeros on the diagonal).
    '''
    from scipy.spatial.distance import braycurtis, squareform

    X = np.array(X)
    n_samples = X.shape[0]
    # Use integer division: `/` yields a float in Python 3 and
    # np.zeros() rejects non-integer shapes.
    n_distance = n_samples * (n_samples - 1) // 2
    d_array = np.zeros(n_distance)
    # Fill the condensed distance vector in combinations() order, which is
    # exactly the layout squareform() expects.
    for i, (idx1, idx2) in enumerate(itertools.combinations(range(n_samples), 2)):
        d_array[i] = braycurtis(X[idx1], X[idx2])
    return squareform(d_array)
def kde_best_match(self, n=500, show_matches=False, **kwargs):
    """
    For each term in text 1, find the term in text 2 with the most
    similar pattern of distribution.

    Args:
        n (int): Consider N most-frequent words.
        show_matches (bool): Show identity (A -> A) matches.

    Returns:
        list: Tuples of (t1 term, t2 term, weight), strongest first.
    """
    mft1 = self.text1.most_frequent_terms(n)
    mft2 = self.text2.most_frequent_terms(n)

    # KDEs are expensive; the original recomputed t1's KDE inside the inner
    # loop and every t2 KDE once per t1 term (O(n^2) kde() calls).  Cache
    # each text-2 estimate once and hoist t1's out of the inner loop (O(n)).
    # dict preserves insertion order, so tie-breaking is unchanged.
    t2_kdes = {t2: self.text2.kde(t2, **kwargs) for t2 in mft2}

    links = []
    for t1 in mft1:
        t1_kde = self.text1.kde(t1, **kwargs)

        # Score against each term in text 2.
        scores = [(t2, 1-distance.braycurtis(t1_kde, kde2))
                  for t2, kde2 in t2_kdes.items()]

        # Get the nearest neighbor.
        scores = sorted(scores, key=lambda x: x[1], reverse=True)
        t2 = scores[0][0]

        if show_matches or t1 != t2:
            links.append((
                self.text1.unstem(t1),
                self.text2.unstem(t2),
                scores[0][1]
            ))

    # Sort strongest -> weakest.
    links = sorted(links, key=lambda x: x[2], reverse=True)
    return links
def score_braycurtis(self, term1, term2, **kwargs):
    """
    Compute a weighting score based on the Bray-Curtis distance between
    the kernel density estimates of two terms, flipped (1 - d) so that
    more similar distributions score higher.

    (The previous docstring said "City Block", but the code computes
    Bray-Curtis.)

    Args:
        term1 (str)
        term2 (str)

    Returns:
        float
    """
    t1_kde = self.kde(term1, **kwargs)
    t2_kde = self.kde(term2, **kwargs)
    return 1-distance.braycurtis(t1_kde, t2_kde)
def wvBray(a):
    """Return the Bray-Curtis distance for each vector pair in *a*.

    *a* is an iterable whose elements are indexable pairs (u, v); the
    result is a list of distances in the same order.
    """
    out = []
    for pair in a:
        out.append(distance.braycurtis(pair[0], pair[1]))
    return out
def metric_braycurtis_2(i, j):
    """Bray-Curtis distance between vectors *i* and *j* (thin wrapper
    around scipy's implementation, usable as a metric callback)."""
    result = dist.braycurtis(i, j)
    return result
# Score the user's input against every Q/A pair with a battery of
# vector-space similarities/distances.  `QTvectorizer`, `userinput` and
# `data` are defined earlier in the file (not shown here) — TODO confirm.
ATvectorizer=TfidfVectorizer()
# Prepend the user input so that row 0 of each TF-IDF matrix is the query.
all_questions=[d['question'] for d in data]
all_questions=[userinput]+all_questions
all_answers=[d['answer'] for d in data]
all_answers=[userinput]+all_answers
QuestionTVectorArray=QTvectorizer.fit_transform(all_questions)
AnswerTVectorArray=ATvectorizer.fit_transform(all_answers)
#print "question cosine similairity-->",cosine_similarity(QuestionTVectorArray[0:1],QuestionTVectorArray)
#print "answer cosine similarity-->",cosine_similarity(AnswerTVectorArray[0:1],AnswerTVectorArray)
# Cosine similarity of the query row (index 0) against every row.
Qcosines=cosine_similarity(QuestionTVectorArray[0:1],QuestionTVectorArray)
Acosines=cosine_similarity(AnswerTVectorArray[0:1],AnswerTVectorArray)
# The scipy metrics below (`dist`) need dense vectors, hence .toarray().
# Q* lists score the query vs. questions, A* lists vs. answers.
Qbray=[dist.braycurtis(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Abray=[dist.braycurtis(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]
Qcanberra=[dist.canberra(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Acanberra=[dist.canberra(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]
Qhamming=[dist.hamming(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Ahamming=[dist.hamming(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]
Qcorrelation=[dist.correlation(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Acorrelation=[dist.correlation(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]
Qcityblock=[dist.cityblock(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
Acityblock=[dist.cityblock(AnswerTVectorArray[0].toarray(),u.toarray()) for u in AnswerTVectorArray]
Qdice=[dist.dice(QuestionTVectorArray[0].toarray(),u.toarray()) for u in QuestionTVectorArray]
def braycurtis(pair):
    """Bray-Curtis distance for a single ``(x, y)`` pair of vectors.

    The original used Python 2 tuple-parameter unpacking
    (``def braycurtis((x, y))``), which was removed in Python 3
    (PEP 3113) and is now a SyntaxError.  Callers still pass one
    2-tuple argument, so the interface is unchanged.
    """
    x, y = pair
    return distance.braycurtis(x, y)
# Word2vec-based pair features for Quora question pairs: several distance
# metrics between the two question vectors plus per-vector shape statistics.
# `data`, `question1_vectors` and `question2_vectors` are built earlier in
# the file — presumably one embedding row per question; verify upstream.
# nan_to_num replaces NaNs (e.g. from empty questions) with zeros before
# each metric is applied.
data['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
data['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
data['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
data['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
data['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
# Minkowski with p=3.
data['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
data['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
# Per-vector distribution shape features (skewness / kurtosis).
data['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
data['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
data['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
data['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]
# NOTE(review): cPickle implies Python 2 (it is `pickle` in Python 3);
# protocol -1 means "highest available".  Files are left to the GC to
# close — confirm this is acceptable for the pipeline.
cPickle.dump(question1_vectors, open('data/q1_w2v.pkl', 'wb'), -1)
cPickle.dump(question2_vectors, open('data/q2_w2v.pkl', 'wb'), -1)
data.to_csv('data/quora_features.csv', index=False)
def test(lst1, lst2):
    """Print correlation / distance diagnostics for two equal-length
    numeric sequences: Pearson r, Spearman rho, Bray-Curtis distance and
    the Kullback-Leibler divergence (scipy.stats.entropy of lst1 vs lst2).
    """
    print("PEARSON: ", str(pearsonr(lst1, lst2)))
    print("SPEARMAN: ", str(spearmanr(lst1, lst2)))
    print("BRAYCURTIS: ", str(braycurtis(lst1, lst2)))
    # Label fixed: entropy(p, q) computes the Kullback-Leibler divergence;
    # the previous label "KULLMANLEIBER" was a misspelling.
    print("KULLBACKLEIBLER: ", str(entropy(lst1, lst2)))