def fit_transform(self, df, k=100):
    '''
    INPUT: pandas dataframe, int
    OUTPUT: pandas dataframe

    Fits internal variables used to fill missing values.
    Returns transformed data ready for machine learning models.

    df: data to fit on; the 'kmeans' method reads its 'Age' and
        'Height (hh)' columns.
    k:  number of centroids requested from Kmeans_DF; only used when
        self.method == 'kmeans'.

    When self.method is 'mode', fitting is delegated to
    self.fit_most_common_values(df).  When it is 'kmeans', this sets
    self.max_age, self.max_delta_h and self.centroids.  Any other value
    of self.method skips fitting and goes straight to self.transform(df).
    '''
    if self.method == 'mode':
        self.fit_most_common_values(df)
    elif self.method == 'kmeans':
        heights = df['Height (hh)']
        # Series.max()/min() skip NaN.  The builtin max()/min() are
        # order-dependent when NaN is present (max([nan, 3, 10]) is nan),
        # which would turn the normalisation constants into NaN on the
        # very data this class is meant to repair.
        tallest = heights.max()
        shortest = heights.min()
        self.max_age = df['Age'].max()
        self.max_delta_h = tallest - shortest
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print('%s %s %s' % (self.max_age, tallest, shortest))
        kmeans = Kmeans_DF(df, k, self.horse_horse_distance)
        self.centroids = kmeans.get_centroids()
        print('centroids fitted...')
    return self.transform(df)
# NOTE(review): the statements down to the `return` are the tail of a distance
# function whose `def` line and the initialisation of `dist` lie outside this
# excerpt; presumably this is horse_horse_distance -- TODO confirm.
    # N starts at 8 and loses 1 for every feature that is null in either
    # record.  Seven features are checked below, so N never drops below 1.
    N = 8.
    # Categorical features: a mismatch adds 1 to the distance.
    for col in ['Breed', 'Color', 'Pedigree', 'Sex']:
        if (pd.isnull(series1[col])) or (pd.isnull(series2[col])):
            N -= 1
        elif series1[col] != series2[col]:
            dist += 1
    # Height: absolute difference divided by max_delta_h.
    if not (pd.isnull(series1['Height (hh)']) or pd.isnull(series2['Height (hh)'])):
        dist += abs(series1['Height (hh)'] - series2['Height (hh)'])\
            / max_delta_h
    else:
        N -= 1
    # Temperament: absolute difference, added unscaled.
    # NOTE(review): presumably already on a 0-1 scale -- confirm.
    if not (pd.isnull(series1['Temperament']) or pd.isnull(series2['Temperament'])):
        dist += abs(series1['Temperament'] - series2['Temperament'])
    else:
        N -= 1
    # Age: absolute difference divided by max_age.
    if not (pd.isnull(series1['Age']) or pd.isnull(series2['Age'])):
        dist += abs(series1['Age'] - series2['Age']) / max_age
    else:
        N -= 1
    # Linear rescale of dist / N that maps 1/8 to 0 and 1 to 1.
    return (dist / N - 1. / 8) * 8. / 7
# Script entry point: for each k, build centroids with Kmeans_DF and write
# them to 'Centroids_<k>.json'.
# NOTE(review): `df` is not defined anywhere in this excerpt -- it must be
# loaded before this loop runs; verify.
if __name__ == '__main__':
    for k in [50, 100, 200]:
        filename = 'Centroids_' + str(k) + '.json'
        kmeans = Kmeans_DF(df, k, horse_horse_distance)
        centroids = kmeans.get_centroids()
        centroids.to_json(filename)
        # Progress message after each k finishes.
        print(str(k) + ' is done...')