Cluster data using k-means ''' # whiten the observations data_w = vq.whiten(data) # create the classifier object kmeans, labels = vq.kmeans2(data_w, k=4, iter=30) # fit the data return kmeans, labels # the file name of the dataset r_filename = '../../Data/Chapter04/bank_contacts.csv' # read the data csv_read = pd.read_csv(r_filename) # select variables selected = csv_read[[ 'n_duration', 'n_nr_employed', 'prev_ctc_outcome_success', 'n_euribor3m', 'n_cons_conf_idx', 'n_age', 'month_oct', 'n_cons_price_idx', 'edu_university_degree', 'n_pdays', 'dow_mon', 'job_student', 'job_technician', 'job_housemaid', 'edu_basic_6y' ]] # cluster the data centroids, labels = findClusters_kmeans(selected.as_matrix()) hlp.printClustersSummary(selected, labels, centroids)
meanShift = cl.MeanShift( bandwidth=bandwidth, bin_seeding=True ) # fit the data return meanShift.fit(data) # the file name of the dataset r_filename = '../../Data/Chapter04/bank_contacts.csv' # read the data csv_read = pd.read_csv(r_filename) # select variables selected = csv_read[['n_duration','n_nr_employed', 'prev_ctc_outcome_success','n_euribor3m', 'n_cons_conf_idx','n_age','month_oct', 'n_cons_price_idx','edu_university_degree','n_pdays', 'dow_mon','job_student','job_technician', 'job_housemaid','edu_basic_6y']] # cluster the data cluster = findClusters_meanShift(selected.as_matrix()) # assess the clusters effectiveness labels = cluster.labels_ centroids = cluster.cluster_centers_ hlp.printClustersSummary(selected, labels, centroids)