/
gower_dist.py
30 lines (23 loc) · 1.24 KB
/
gower_dist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import pandas as pd
import numpy as np
from sklearn.neighbors import DistanceMetric
def gower_distance(X: pd.DataFrame) -> np.ndarray:
    """Return the pairwise Gower distance between the rows of a DataFrame.

    Each column of ``X`` is one feature. A per-feature distance matrix is
    computed and the matrices are averaged with equal weight.

    * Numeric columns (integer, float, bool): Manhattan distance divided by
      the range (max - min) of the column, so every entry lies in [0, 1].
      A constant column has zero range and contributes a distance of 0 for
      every pair instead of a 0/0 NaN.
    * All other columns (object, string, category, ...) are treated as
      nominal: the distance is 0 when two rows hold the same value and 1
      otherwise. This is what the Dice distance yields on a one-hot
      encoding of a single feature
      (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient).
      Missing nominal values are grouped together, i.e. two missing values
      are at distance 0 from each other and 1 from everything else.

    Parameters
    ----------
    X : pandas.DataFrame
        Observations along the rows, features along the columns.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(n_rows, n_rows)`` float matrix of Gower distances.
        A missing value in a numeric column makes that column's range NaN
        and therefore propagates NaN into the result.

    Raises
    ------
    ValueError
        If ``X`` has no columns, since the mean over zero features is
        undefined.
    """
    n_rows, n_features = X.shape
    if n_features == 0:
        raise ValueError("X must contain at least one feature column")
    if n_rows == 0:
        return np.zeros((0, 0), dtype=float)

    # Keep a running sum rather than a list of n x n matrices, so peak
    # memory does not grow with the number of features.
    total = np.zeros((n_rows, n_rows), dtype=float)

    for _, column in X.items():
        if pd.api.types.is_numeric_dtype(column.dtype):
            values = column.to_numpy(dtype=float, na_value=np.nan)
            feature_dist = np.abs(values[:, None] - values[None, :])
            spread = np.ptp(values)
            # spread == 0 means every value is identical, so the distances
            # are already all zero; skipping the division avoids 0/0.
            # A NaN spread compares unequal to 0 and is left to propagate.
            if spread != 0:
                feature_dist = feature_dist / spread
        else:
            # Integer codes make the comparison well defined for any
            # hashable values; missing entries all receive the code -1.
            codes, _ = pd.factorize(column)
            feature_dist = (codes[:, None] != codes[None, :]).astype(float)

        total += feature_dist

    return total / n_features