import numpy as np

# KMeans and HAC are the project's clustering classes, assumed to be
# importable from elsewhere in the repository.

def evaluate_model(model, model_type, num_of_classes, candidate_feature_set, data_set):
    """Cluster the data on the candidate feature subset and score the
    result with an LDA-like objective function."""
    # Convert candidate_feature_set from a bitmask f_1, ..., f_d to the
    # list of indices where f_i = 1 (for example, [1 0 0 1 0] -> [0, 3]).
    candidate_feature_set = [idx for idx in range(len(candidate_feature_set))
                             if candidate_feature_set[idx] == 1]
    if model_type == "Kmeans":
        model = KMeans(num_of_classes)
    elif model_type == "HAC":
        model = HAC(num_of_classes)
    model.cluster(data_set[:, candidate_feature_set])
    return model.calculate_performance()
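A minimal usage sketch of evaluate_model, assuming the project's KMeans class is in scope; the toy data, the 2-cluster count, and the bitmask below are made-up values for illustration only:

import numpy as np

# Hypothetical toy data: 6 samples, 5 features (values are illustrative only)
toy_data = np.random.rand(6, 5)

# Bitmask selecting features 0 and 3, i.e. [1, 0, 0, 1, 0] -> indices [0, 3]
mask = [1, 0, 0, 1, 0]

# The model argument is rebuilt inside the function when model_type matches,
# so a placeholder such as None is enough here.
score = evaluate_model(None, "Kmeans", 2, mask, toy_data)
print("LDA-like objective on features [0, 3] = %f" % score)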
def perform_SFS_feature_selection(model, model_type, num_of_classes, data_set):
    """Greedy sequential forward selection: repeatedly add the single
    feature that most improves clustering performance, stopping when no
    remaining feature yields an improvement."""
    # Indices of the features still available to choose from
    feature_set = [i for i in range(data_set.shape[1])]
    chosen_features = []
    chosen_clusters = []
    base_performance = float("-inf")
    # While there are still features to choose from...
    while len(feature_set) > 0:
        # Initialize performance metrics for this round
        best_performance = float("-inf")
        best_clusters = []
        # Try each feature that hasn't been chosen yet
        for feature in feature_set:
            chosen_features.append(feature)
            # Train a fresh model on the tentative feature subset
            if model_type == "Kmeans":
                model = KMeans(num_of_classes)
            elif model_type == "HAC":
                model = HAC(num_of_classes)
            clusters = model.cluster(data_set[:, chosen_features])
            # Score via the LDA-like objective function
            current_performance = model.calculate_performance()
            # If this combination beats the best so far this round, note it
            if current_performance > best_performance:
                best_performance = current_performance
                best_feature = feature
                best_clusters = clusters
            chosen_features.remove(feature)
        # If the best candidate this round improves on the best performance
        # we've seen overall, commit the feature; otherwise stop.
        if best_performance > base_performance:
            base_performance = best_performance
            feature_set.remove(best_feature)
            chosen_features.append(best_feature)
            chosen_clusters = best_clusters
        else:
            break
    return chosen_features, chosen_clusters
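A hedged driver sketch for perform_SFS_feature_selection; the data shape and cluster count are assumptions, not values from the original project:

import numpy as np

# Illustrative data: 150 samples, 4 features
toy_data = np.random.rand(150, 4)

# As above, the model argument is replaced inside the loop, so None works.
features, clusters = perform_SFS_feature_selection(None, "Kmeans", 3, toy_data)
print("Chosen feature indices: %s" % features)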
data_instances = []
data_file = open(test[0])
print("Running with %s" % test[0])
for line in data_file:
    line_split = line.split(',')
    data_instances.append([float(x) for x in line_split])
data_instances = np.array(data_instances)

# Run SFS using k-means and HAC
kmeans_model = KMeans(test[1])
hac_model = HAC(test[1])

# Glass dataset
if "glass" in test[0]:
    kmeans_sfs_glass = np.array([1, 3])
    kmeans_model.cluster(data_instances[:, kmeans_sfs_glass])
    print("K-means SFS glass performance = %f" % kmeans_model.calculate_performance())

    kmeans_ga_glass = np.array([0, 1, 2, 3, 4, 5, 6])
    kmeans_model = KMeans(test[1])
    kmeans_model.cluster(data_instances[:, kmeans_ga_glass])
    print("K-means GA glass performance = %f" % kmeans_model.calculate_performance())

    hac_sfs_glass = np.array([0])
    hac_model.cluster(data_instances[:, hac_sfs_glass])
    print("HAC SFS glass performance = %f" % hac_model.calculate_performance())
# Iris dataset
elif "iris" in test[0]:
    kmeans_sfs_iris = np.array([1])
    kmeans_model = KMeans(test[1])
    kmeans_model.cluster(data_instances[:, kmeans_sfs_iris])
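The script above reads test[0] as a CSV path and test[1] as the number of clusters, so it presumably runs inside a loop over test definitions. A plausible sketch of that loop, with file paths and cluster counts as assumptions only:

# Hypothetical test definitions: (data file path, number of clusters k).
# The paths and k values below are assumptions for illustration.
tests = [
    ("data/glass.data", 7),  # glass identification
    ("data/iris.data", 3),   # iris: 3 species
]
for test in tests:
    # ... the evaluation script above runs once per test ...
    pass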
print "Running with %s" % test[0] for line in data_file: line_split = line.split(',') data_instances.append(map(float, line_split)) data_instances = np.array(data_instances) # Run SFS using k-means and HAC kmeans_model = KMeans(test[1]) hac_model = HAC(test[1]) # Glass dataset if "glass" in test[0]: kmeans_sfs_glass = np.array([1, 3]) kmeans_model.cluster(data_instances[:, kmeans_sfs_glass]) print("K-means SFS glass performance = %f" % kmeans_model.calculate_performance()) kmeans_ga_glass = np.array([0, 1, 2, 3, 4, 5, 6]) kmeans_model = KMeans(test[1]) kmeans_model.cluster(data_instances[:, kmeans_ga_glass]) print("K-means GA glass performance = %f" % kmeans_model.calculate_performance()) hac_sfs_glass = np.array([0]) hac_model.cluster(data_instances[:, hac_sfs_glass]) print("HAC SFS glass performance = %f" % hac_model.calculate_performance()) # Iris dataset elif "iris" in test[0]: kmeans_sfs_iris = np.array([1])