from sklearn.datasets import make_blobs, make_moons, make_circles
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np


def get_toy_classification_data(n_samples=100, centers=3, n_features=2, type_data="blobs"):
    # generate a 2d classification dataset
    if type_data == "blobs":
        X, y = make_blobs(n_samples=n_samples, centers=centers, n_features=n_features)
    elif type_data == "moons":
        X, y = make_moons(n_samples=n_samples, noise=0.1)
    elif type_data == "circles":
        X, y = make_circles(n_samples=n_samples, noise=0.05)

    # scatter plot, dots colored by class value
    # df = DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
    # colors = {0: 'red', 1: 'blue', 2: 'green'}
    # fig, ax = pyplot.subplots()
    # grouped = df.groupby('label')
    # for key, group in grouped:
    #     group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key])
    # pyplot.show()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=None)
    classes = np.unique(y_train)

    if 0:  # disabled alternative: sklearn's OneHotEncoder instead of the helper below
        enc = OneHotEncoder().fit(classes.reshape(-1, 1))
        y_train = enc.transform(y_train.reshape(-1, 1))
        print(y_test)
        y_test = enc.transform(y_test.reshape(-1, 1))
        print(y_test)

    y_train = one_hot_encode(y_train, classes)
    y_test = one_hot_encode(y_test, classes)

    return X_train, y_train, X_test, y_test, classes
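# The function above calls a one_hot_encode helper that is not defined in this
# snippet. A minimal sketch of what such a helper might look like
# (hypothetical; the original implementation may differ):
import numpy as np

def one_hot_encode(y, classes):
    # Build an indicator matrix: one row per sample, one column per class.
    index = {c: i for i, c in enumerate(classes)}
    encoded = np.zeros((len(y), len(classes)))
    for row, label in enumerate(y):
        encoded[row, index[label]] = 1.0
    return encoded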
def __init__(self):
    self.start_time = time.time()

    if Configuration.data == "Social_Network_Ads.csv":
        self.dataset = pd.read_csv(str(Configuration.data))

    if Configuration.algorithm == "linear_regression":
        self.X = self.dataset.iloc[:, :-1].values
        self.y = self.dataset.iloc[:, 1].values
    elif Configuration.algorithm in ("logistic_regression", "svc",
                                     "decision_tree_classification",
                                     "random_forest_classification", "knn"):
        if Configuration.data == "Social_Network_Ads.csv":
            self.X = self.dataset.iloc[:, [2, 3]].values
            self.y = self.dataset.iloc[:, 4].values
        else:
            if Configuration.data == "moons":
                from sklearn.datasets import make_moons  # samples_generator module was removed from sklearn
                self.X, self.y = make_moons(100, noise=.2, random_state=0)
            elif Configuration.data == "circles":
                from sklearn.datasets import make_circles
                self.X, self.y = make_circles(100, factor=.5, noise=.1, random_state=0)
    elif Configuration.algorithm == "polynomial_regression":
        self.X = self.dataset.iloc[:, 1:2].values
        self.y = self.dataset.iloc[:, 2].values
    elif Configuration.algorithm == "kmeans":
        self.X = self.dataset.iloc[:, [3, 4]].values
        self.y = None

    if Configuration.data == "Social_Network_Ads.csv":
        self.directory = "SocialNetworkAds"
    elif Configuration.data == "moons":
        self.directory = "Moons"
    elif Configuration.data == "circles":
        self.directory = "Circles"
def qa2():
    print("khan")
    X, y = make_moons(n_samples=300, noise=0.05)
    print(y)

    # Plot elbow graph to find the minimal optimal number of clusters
    wcss = []
    for i in range(1, 11):
        kmeans1 = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
        kmeans1.fit(X)
        wcss.append(kmeans1.inertia_)
    """
    plt.plot(range(1, 11), wcss)
    plt.title("Elbow method to find minimal number of clusters")
    plt.xlabel("Number of clusters")
    plt.ylabel("WCSS")
    """
    # plt.show()

    # Now fit the model
    kMeanModel = KMeans(n_clusters=4)
    kMeanModel.fit(X)
    predictVal = kMeanModel.predict(X)
    print("****Predicted values*****")
    print(predictVal)

    fobj = plt.figure(figsize=(6, 6), facecolor=(1, 0, 1))
    fobj.canvas.manager.set_window_title('Plot Diagram')  # canvas.set_window_title is deprecated
    spobj1 = fobj.add_subplot(221)
    spobj1.scatter(range(1, 11), wcss)
    spobj2 = fobj.add_subplot(223)
    spobj2.scatter(X[:, 0], X[:, 1], c=predictVal, cmap='viridis')
    plt.show()
def plot_class(clf):
    # X_train, y_train = make_blobs(n_samples=200, centers=2,
    #                               random_state=2, cluster_std=2.50)
    X_train, y_train = make_moons(200, noise=0.20)

    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.set_xlabel('feature 1', color='gray')
    ax.set_ylabel('feature 2', color='gray')
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=100, cmap='Paired', zorder=3)

    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    x = np.linspace(xlim[0], xlim[1], 50)
    y = np.linspace(ylim[0], ylim[1], 50)
    yy, xx = np.meshgrid(y, x)
    X_test = np.vstack([xx.ravel(), yy.ravel()]).T

    clf.fit(X_train, y_train)
    zz = clf.predict(X_test)
    zz = zz.reshape(xx.shape)
    ax.contourf(xx, yy, zz, cmap='Paired', alpha=0.4, zorder=1)
    plt.show()
def generate_moons_sample(self):
    X, y = make_moons(n_samples=500, noise=0.08)
    y = (y - 0.5) * 2  # remap labels from {0, 1} to {-1, +1}
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return X_train, X_test, y_train, y_test
def _get_moons(*args, **kwargs):
    X, y = make_moons(n_samples=100, noise=0.1)
    metadata = {
        'regression': False,
        'scoring': classifier_scoring,
        'primary_metric': 'accuracy',
    }
    return X, y, metadata
def get_dataset(self):
    if self.type == 'moon':
        datas, labels = make_moons(n_samples=self.n_samples, noise=self.noise)
    elif self.type == 'circle':
        datas, labels = make_circles(n_samples=self.n_samples, noise=self.noise)
    else:
        # printing and falling through would raise a NameError below; fail explicitly instead
        raise ValueError('wrong dataset type input: %r' % self.type)
    dataset = {'data': datas, 'target': labels}
    return dataset
def choose_dataset(chosen_dataset, n_points):
    X = None
    if chosen_dataset == "blobs":
        X = make_blobs(n_samples=n_points, centers=4, n_features=2, cluster_std=1.5, random_state=42)[0]
    elif chosen_dataset == "moons":
        X = make_moons(n_samples=n_points, noise=0.05, random_state=42)[0]
    elif chosen_dataset == "scatter":
        X = make_blobs(n_samples=n_points, cluster_std=[2.5, 2.5, 2.5], random_state=42)[0]
    elif chosen_dataset == "circle":
        X = make_circles(n_samples=n_points, noise=0, random_state=42)[0]
    return X
def svm_kernal():
    # create a dataset for checking kernel SVM on binary classification
    X, y = make_moons(n_samples=100, noise=0.1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)

    kernal_pegasos = Kernalpegasos(0.5)
    kernal_pegasos.fit(X_train, y_train, 100000, True, 'rbf')

    classifier = SVC(kernel='rbf', random_state=0, gamma=1, C=1)
    classifier.fit(X_train, y_train)

    # predict using the built-in kernel SVM
    y_pred = classifier.predict(X_test)
    y_kernal_pred = kernal_pegasos.check_test_points(X_train, y_train, X_test, 'rbf')
    # print(y_pred)
    # print(y_kernal_pred)

    cm = confusion_matrix(y_test, y_pred)
    # print(cm)
    # print(classification_report(y_test, y_pred))
    # print(classification_report(y_test, y_kernal_pred))

    # plt.scatter(X[y == 1, 0], X[y == 1, 1], c='b', marker='x', label='1')
    # plt.scatter(X[y == 0, 0], X[y == 0, 1], c='r', marker='s', label='0')
    # plt.xlim([-3, 3])
    # plt.ylim([-3, 3])
    # plt.legend(loc='best')
    # plt.tight_layout()

    plot_decision_regions(X, y, classifier=classifier)
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()
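# Kernalpegasos is a user-defined class whose source is not shown here. The
# 'rbf' kernel it is asked for presumably matches the Gaussian kernel of the
# sklearn SVC it is compared against; a minimal sketch of that kernel (an
# assumption about the class, not its actual code):
import numpy as np

def rbf_kernel(x1, x2, gamma=1.0):
    # K(x1, x2) = exp(-gamma * ||x1 - x2||^2), with gamma=1 as passed to SVC above
    return np.exp(-gamma * np.sum((np.asarray(x1) - np.asarray(x2)) ** 2))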
"""Prepare an ML model using KMeans algorithm to cluster some sample input generated using make_moon function. Plot the clusters. Also plot the same points by clustering it with Spectral Clustering Model. """ from sklearn.datasets.samples_generator import make_moons from sklearn.cluster import KMeans from sklearn.cluster import SpectralClustering import sklearn.metrics import matplotlib.pyplot as plt import pandas as pd import numpy as np X, y_true = make_moons(n_samples=300, noise=0.05) kmeans = KMeans(n_clusters=4) kmeans.fit(X) y_means = kmeans.predict(X) plt.scatter(X[:, 0], X[:, 1], s=50, c=y_means, cmap='viridis') #plt.show() model = SpectralClustering(2, affinity='nearest_neighbors') labels = model.fit_predict(X) plt.scatter(X[:, 0], X[:, 1], s=50, c=labels, cmap='viridis') plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

print(tf.__version__)

# samples_generator module was removed from sklearn; import from sklearn.datasets
from sklearn.datasets import make_moons, make_circles, make_blobs

# generate 2d classification dataset
n = 10000
X, y = make_moons(n_samples=n, noise=0.1)

# scatter plot, dots colored by class value
df = pd.DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
colors = {0: 'red', 1: 'blue'}
fig, ax = plt.subplots()
grouped = df.groupby('label')
for key, group in grouped:
    group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key])
plt.show()

datadict = {'X1': X[:, 0], 'X2': X[:, 1], 'target': y}
data = pd.DataFrame(data=datadict)
X = data.iloc[:, [0, 1]].values
type(X)
y = data.target.values

# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split
""" print(__doc__) import numpy as np from sklearn.cluster import DBSCAN from sklearn import metrics from sklearn.datasets.samples_generator import make_moons from sklearn.preprocessing import StandardScaler # ############################################################################# # Generate sample data centers = [[1, 1], [-1, -1], [1, -1]] X, labels_true = make_moons(n_samples=1000, noise=0.1) X = StandardScaler().fit_transform(X) # ############################################################################# # Compute DBSCAN db = DBSCAN(eps=0.3, min_samples=10).fit(X) core_samples_mask = np.zeros_like(db.labels_, dtype=bool) core_samples_mask[db.core_sample_indices_] = True labels = db.labels_ # Number of clusters in labels, ignoring noise if present. n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0) # #############################################################################
# from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.datasets import make_moons  # samples_generator module was removed from sklearn
from sklearn.cluster import SpectralClustering
import matplotlib.pyplot as plt

# X, y_true = make_blobs(n_samples=300, centers=4, cluster_std=0.6)
X, y_true = make_moons(200, noise=0.05)

# Simple KMeans
kmeans = KMeans(2)
kmeans.fit(X)
y_means = kmeans.predict(X)
plt.scatter(X[:, 0], X[:, 1], s=50, c=y_means)
plt.show()

# Spectral Clustering
model = SpectralClustering(2, affinity='nearest_neighbors')
# model = SpectralClustering(2, affinity='nearest_neighbors', assign_labels='kmeans')
labels = model.fit_predict(X)

# Plot graph
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
plt.show()
sns.lmplot(x='x', y='y', data=df, fit_reg=False)  # newer seaborn requires keyword x/y arguments
plt.show()

# ----- make_blobs -----
from sklearn.datasets import make_blobs  # samples_generator module was removed from sklearn

X, y = make_blobs(n_samples=200, centers=4, n_features=2, random_state=101)
df = pd.DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
sns.lmplot(x='x', y='y', data=df, hue='label', fit_reg=False)
plt.show()

# ----- make_circles -----
from sklearn.datasets import make_circles

X, y = make_circles(n_samples=200, noise=0.1, factor=0.5, random_state=101)
df = pd.DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
sns.lmplot(x='x', y='y', data=df, hue='label', fit_reg=False)
plt.show()

# ----- make_moons -----
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=200, noise=0.1, random_state=101)
df = pd.DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
sns.lmplot(x='x', y='y', data=df, hue='label', fit_reg=False)
plt.show()
    squared=True)  # closes a pairwise-distance call that begins before this excerpt
order = distance.argmin(axis=0)

plt.subplot(122)
for k, col in zip(range(3), colors):
    my_members = k_means_3_labels == order[k]
    plt.scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    plt.scatter(cluster_center[0], cluster_center[1], marker='o', c=col, s=200, alpha=0.8)
plt.axis('equal')
plt.title('KMeans 3')

'''
#2: NON-SPHERICAL SHAPES
'''
X, true_labels = make_moons(n_samples=1000, noise=.05)

plt.figure(figsize=(12, 6))
plt.suptitle('Non-Spherical Shapes', fontsize=15)
plt.subplot(121)
for k, col in zip(range(2), colors):
    my_members = true_labels == k
    plt.scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
plt.axis('equal')
plt.title('Original Data')

# Compute clustering with 2 clusters
k_means_2 = KMeans(init='k-means++', n_clusters=2, n_init=10)
k_means_2.fit(X)
k_means_2_labels = k_means_2.labels_
# you can change them when you are testing your solution, but when submitting leave them in the original state
n_samples = 50
C_const = 100
threshold = 1e-3

# generating (almost) linearly separable data, replacing 0 labels with -1
X_blob, Y_blob = make_blobs(n_samples=n_samples, centers=2, random_state=0, cluster_std=1.00)
Y_blob[Y_blob == 0] = -1
# plt.scatter(X_blob[:, 0], X_blob[:, 1], c=Y_blob, s=50, cmap='autumn')
# plt.show()

minimize_and_plot(X_blob, Y_blob, linear_kernel, C_const, threshold)      # svm with linear kernel
minimize_and_plot(X_blob, Y_blob, polynomial_kernel, C_const, threshold)  # svm with polynomial kernel

# generating moon-shaped data, replacing 0 labels with -1
X_moon, Y_moon = make_moons(n_samples=n_samples, shuffle=False, noise=0.10, random_state=0)
Y_moon[Y_moon == 0] = -1
# plt.scatter(X_moon[:, 0], X_moon[:, 1], c=Y_moon, s=50, cmap='autumn')
# plt.show()

minimize_and_plot(X_moon, Y_moon, linear_kernel, C_const, threshold)      # svm with linear kernel
minimize_and_plot(X_moon, Y_moon, polynomial_kernel, C_const, threshold)  # svm with polynomial kernel
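# linear_kernel, polynomial_kernel and minimize_and_plot are defined elsewhere
# in this assignment. Plausible kernel definitions, consistent with how they
# are used above (hypothetical sketches; the assignment's own versions may differ):
import numpy as np

def linear_kernel(x1, x2):
    # plain inner product: K(x1, x2) = <x1, x2>
    return np.dot(x1, x2)

def polynomial_kernel(x1, x2, degree=3, coef0=1.0):
    # polynomial kernel: K(x1, x2) = (<x1, x2> + coef0)^degree
    return (np.dot(x1, x2) + coef0) ** degree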
import networkx as nx

# import warnings
# warnings.filterwarnings("ignore")

##################################################################################################
#
# Generating Dataset

# generate 2d classification dataset
X, y = make_blobs(n_samples=120, centers=4, n_features=2, cluster_std=1.8, random_state=42)
X1, y1 = make_moons(n_samples=80, noise=0.05, random_state=42)
varied = make_blobs(n_samples=120, cluster_std=[3.5, 3.5, 3.5], random_state=42)[0]

plt.scatter(varied[:, 0], varied[:, 1])
# plt.gcf().gca().add_artist(plt.Circle((-5, 0), 5, color="red", fill=False, linewidth=3, alpha=0.7))
plt.show()

##################################################################################################
# OPTICS

from algorithms.optics import OPTICS, plot_clust

ClustDist, CoreDist = OPTICS(X, eps=0.5, minPTS=3, plot=True, plot_reach=True)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons  # samples_generator module was removed from sklearn

np.random.seed(42)
data, labels = make_moons(n_samples=500, noise=0.1)
colors = ['r' if y else 'b' for y in labels]
print('data.shape =', data.shape, ', labels.shape =', labels.shape)
plt.scatter(data[:, 0], data[:, 1], c=colors)
# plt.show()


def sigmoid(z):
    return 1 / (1 + np.exp(-z))


def logistic_regression(x, y, learning_rate, num_steps=40):
    '''
    Input:
        x = the data
        y = the labels
        learning_rate = learning rate
        num_steps = number of iterations
    Output:
        w = the trained model weights
    '''
    # Start by initializing the weights w with w_i = 1 for all i, and make it a 3x1
    # column vector (numpy array). You can use the numpy function, ones.
    # YOUR CODE HERE
    w = np.ones((np.size(x, 1) + 1, 1))  # MODIFY THIS LINE!

    # Augment x with an initial column of ones for the bias term (the zeroth column of x).
    # You can use the numpy functions ones and hstack to accomplish this.
    # YOUR CODE HERE
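# The snippet is cut off at the second exercise prompt. One way to complete the
# augmentation step it describes, following the hint to use np.ones and
# np.hstack (a sketch of a possible solution, not the official one):
import numpy as np

x_demo = np.random.randn(5, 2)                  # stand-in for the moons features
bias_column = np.ones((x_demo.shape[0], 1))     # the "zeroth column" of ones
x_augmented = np.hstack([bias_column, x_demo])  # shape (5, 3): bias column + 2 features
print(x_augmented.shape)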
def generate_two_moons():
    X, true_labels = make_moons(n_samples=200, noise=.05)
    return X, true_labels
# makes sine wave
# x_t = np.linspace(-1.5*np.pi, 1.5*np.pi, n_point_per_cluster)
x_t = np.linspace(-20, -10, n_point_per_cluster)
y_t = 10 + 5 * np.sin(x_t)
for i in range(len(y_t)):
    y_t[i] = y_t[i] + 0.0 * np.random.randn()
x5 = []
for i in range(len(x_t)):
    xx = []
    xx.append(x_t[i])
    xx.append(y_t[i])
    x5.append(np.array(xx))

# makes a moon
x4, y4 = make_moons(n_samples=3 * n_point_per_cluster, shuffle=True, noise=0.01, random_state=0)
x4 = [15, -15] + 5.0 * x4

line_t = np.linspace(-5, 5, n_point_per_cluster)
line_y1 = []
m = 0.0
c = -13.0
line_noise = 0.5
for i in range(len(line_t)):
    line_y1.append(m * line_t[i] + line_noise * np.random.randn() + c)
line1 = []
for i in range(len(line_y1)):
    xx = []
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    for index, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=colors[index], marker=markers[index],
                    label=cl, edgecolor='black')


# each assignment below overwrites (X, y); only the moons data is actually used
(X, y) = make_blobs(n_samples=1000, n_features=2, centers=2, cluster_std=1.05)
(X, y) = make_gaussian_quantiles(n_samples=1000, n_features=2, n_classes=3)
(X, y) = make_moons(n_samples=1000)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

svm = SVC(kernel='rbf', C=1.0, gamma=0.5)
svm.fit(X_train_std, y_train)
plot_decision_regions(X_test_std, y_test, svm)
plt.show()
Algorithm flow:
a. If the ε-neighborhood of a sample point p contains more than m objects, create a new cluster with p as a core object.
b. Objects that are directly density-reachable from a core object are merged into the cluster.
c. The algorithm ends when no point can update any cluster.
Note: non-core objects have no directly density-reachable objects of their own; they generally form the edges of clusters. Each cluster may contain several core objects.
"""
from sklearn.datasets import make_moons, make_circles  # samples_generator module was removed from sklearn
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import time

# show the moon-shaped data
x, y_true = make_moons(n_samples=1000, noise=0.15)
plt.scatter(x[:, 0], x[:, 1], c=y_true)
plt.show()

# KMeans
start = time.time()
kmeans = KMeans(init='k-means++', n_clusters=2, random_state=8).fit(x)
end = time.time()
interval = end - start
plt.scatter(x[:, 0], x[:, 1], c=kmeans.labels_)
plt.title('time: %f' % interval)
plt.show()

# DBSCAN
start = time.time()
dbscan = DBSCAN(eps=.1, min_samples=6).fit(x)
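# The docstring at the top of this snippet describes DBSCAN's core-object test;
# a minimal sketch of step (a) for illustration (this is not sklearn's
# implementation, and the function name is hypothetical):
import numpy as np

def is_core_point(X, i, eps, min_samples):
    # Point i is a core object if its eps-neighborhood (itself included)
    # contains at least min_samples points.
    distances = np.linalg.norm(X - X[i], axis=1)
    return np.sum(distances <= eps) >= min_samples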
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.cm as cm
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs, make_moons, make_circles  # samples_generator module was removed from sklearn

x, y = make_moons(n_samples=200, noise=.05, random_state=0)
n_clusters = 2

# clustering
clusterer = KMeans(n_clusters=n_clusters, init='random', random_state=10)
cluster_labels = clusterer.fit_predict(x, y)

plt.figure()
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
plt.scatter(x[:, 0], x[:, 1], marker='.', s=70, lw=0, alpha=0.7, c=colors, edgecolor='k')
plt.title('KMeans')
plt.show()

# hierarchical clustering
linkage_list = ['single', 'average', 'complete', 'ward']
def main(selectedDataset="digits", pop_size=100, max_generations=100):

    # a few hard-coded values
    figsize = [5, 4]
    seed = 42
    # pop_size = 300
    offspring_size = 2 * pop_size
    # max_generations = 300
    maximize = False
    # selectedDataset = "circles"
    selectedClassifiers = ["SVC"]

    # a list of classifiers
    allClassifiers = [
        [RandomForestClassifier, "RandomForestClassifier", 1],
        [BaggingClassifier, "BaggingClassifier", 1],
        [SVC, "SVC", 1],
        [RidgeClassifier, "RidgeClassifier", 1],
        # [AdaBoostClassifier, "AdaBoostClassifier", 1],
        # [ExtraTreesClassifier, "ExtraTreesClassifier", 1],
        # [GradientBoostingClassifier, "GradientBoostingClassifier", 1],
        # [SGDClassifier, "SGDClassifier", 1],
        # [PassiveAggressiveClassifier, "PassiveAggressiveClassifier", 1],
        # [LogisticRegression, "LogisticRegression", 1],
    ]
    selectedClassifiers = [classifier[1] for classifier in allClassifiers]

    folder_name = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + "-archetypes-" + selectedDataset + "-" + str(pop_size)
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    else:
        sys.stderr.write("Error: folder \"" + folder_name + "\" already exists. Aborting...\n")
        sys.exit(0)

    # open the logging file
    logfilename = os.path.join(folder_name, 'logfile.log')
    logger = setup_logger('logfile_' + folder_name, logfilename)
    logger.info("All results will be saved in folder \"%s\"" % folder_name)

    # load different datasets, prepare them for use
    logger.info("Preparing data...")

    # synthetic databases
    centers = [[1, 1], [-1, -1], [1, -1]]
    blobs_X, blobs_y = make_blobs(n_samples=400, centers=centers, n_features=2, cluster_std=0.6, random_state=seed)
    circles_X, circles_y = make_circles(n_samples=400, noise=0.15, factor=0.4, random_state=seed)
    moons_X, moons_y = make_moons(n_samples=400, noise=0.2, random_state=seed)
    iris = datasets.load_iris()
    digits = datasets.load_digits()
    # forest_X, forest_y = loadForestCoverageType()  # local function
    mnist_X, mnist_y = loadMNIST()  # local function

    dataList = [
        [blobs_X, blobs_y, 0, "blobs"],
        [circles_X, circles_y, 0, "circles"],
        [moons_X, moons_y, 0, "moons"],
        [iris.data, iris.target, 0, "iris4"],
        [iris.data[:, 2:4], iris.target, 0, "iris2"],
        [digits.data, digits.target, 0, "digits"],
        # [forest_X, forest_y, 0, "covtype"],
        [mnist_X, mnist_y, 0, "mnist"]
    ]

    # argparse; all arguments are optional
    parser = argparse.ArgumentParser()
    parser.add_argument("--classifiers", "-c", nargs='+',
                        help="Classifier(s) to be tested. Default: %s. Accepted values: %s"
                        % (selectedClassifiers[0], [x[1] for x in allClassifiers]))
    parser.add_argument("--dataset", "-d",
                        help="Dataset to be tested. Default: %s. Accepted values: %s"
                        % (selectedDataset, [x[3] for x in dataList]))
    parser.add_argument("--pop_size", "-p", type=int, help="EA population size. Default: %d" % pop_size)
    parser.add_argument("--offspring_size", "-o", type=int, help="EA offspring size. Default: %d" % offspring_size)
    parser.add_argument("--max_generations", "-mg", type=int,
                        help="Maximum number of generations. Default: %d" % max_generations)

    # finally, parse the arguments
    args = parser.parse_args()

    # a few checks on the (optional) inputs
    if args.dataset:
        selectedDataset = args.dataset
        if selectedDataset not in [x[3] for x in dataList]:
            logger.info("Error: dataset \"%s\" is not an accepted value. Accepted values: %s"
                        % (selectedDataset, [x[3] for x in dataList]))
            sys.exit(0)

    if args.classifiers is not None and len(args.classifiers) > 0:
        selectedClassifiers = args.classifiers
        for c in selectedClassifiers:
            if c not in [x[1] for x in allClassifiers]:
                logger.info("Error: classifier \"%s\" is not an accepted value. Accepted values: %s"
                            % (c, [x[1] for x in allClassifiers]))
                sys.exit(0)

    if args.max_generations:
        max_generations = args.max_generations
    if args.pop_size:
        pop_size = args.pop_size
    if args.offspring_size:
        offspring_size = args.offspring_size

    # TODO: check that min_points < max_points and max_generations > 0

    # print out the current settings
    logger.info("Settings of the experiment...")
    logger.info("Fixed random seed: %d" % (seed))
    logger.info("Selected dataset: %s; Selected classifier(s): %s" % (selectedDataset, selectedClassifiers))
    logger.info("Population size in EA: %d; Offspring size: %d; Max generations: %d"
                % (pop_size, offspring_size, max_generations))

    # create the list of classifiers
    classifierList = [x for x in allClassifiers if x[1] in selectedClassifiers]

    # pick the dataset
    db_index = -1
    for i in range(0, len(dataList)):
        if dataList[i][3] == selectedDataset:
            db_index = i
    dbname = dataList[db_index][3]
    X, y = dataList[db_index][0], dataList[db_index][1]
    number_classes = np.unique(y).shape[0]

    logger.info("Creating train/test split...")
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
    listOfSplits = [split for split in skf.split(X, y)]
    trainval_index, test_index = listOfSplits[0]
    X_trainval, y_trainval = X[trainval_index], y[trainval_index]
    X_test, y_test = X[test_index], y[test_index]
    # random_state must be omitted when shuffle=False; recent scikit-learn raises an error otherwise
    skf = StratifiedKFold(n_splits=3, shuffle=False)
    listOfSplits = [split for split in skf.split(X_trainval, y_trainval)]
    train_index, val_index = listOfSplits[0]
    X_train, y_train = X_trainval[train_index], y_trainval[train_index]
    X_val, y_val = X_trainval[val_index], y_trainval[val_index]
    logger.info("Training set: %d lines (%.2f%%); test set: %d lines (%.2f%%)"
                % (X_train.shape[0], (100.0 * float(X_train.shape[0] / X.shape[0])),
                   X_test.shape[0], (100.0 * float(X_test.shape[0] / X.shape[0]))))

    # rescale data
    scaler = StandardScaler()
    sc = scaler.fit(X_train)
    X = sc.transform(X)
    X_trainval = sc.transform(X_trainval)
    X_train = sc.transform(X_train)
    X_val = sc.transform(X_val)
    X_test = sc.transform(X_test)

    for classifier in classifierList:
        classifier_name = classifier[1]

        # start creating folder name
        experiment_name = os.path.join(folder_name,
                                       datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
                                       + "-archetypes-evolution-" + dbname + "-" + classifier_name)
        if not os.path.exists(experiment_name):
            os.makedirs(experiment_name)

        logger.info("Classifier used: " + classifier_name)

        start = time.time()
        solutions, trainAccuracy, testAccuracy = evolveArchetypes(
            X, y, X_train, y_train, X_test, y_test, classifier,
            pop_size, offspring_size, max_generations,
            number_classes=number_classes, maximize=maximize,
            seed=seed, experiment_name=experiment_name)
        end = time.time()
        exec_time = end - start

        # only candidates with all classes are considered
        final_archive = []
        for sol in solutions:
            c = np.array(sol.candidate)
            y_core = c[:, -1]
            if len(set(y_core)) == number_classes:
                final_archive.append(sol)

        logger.info("Now saving final Pareto front in a figure...")
        pareto_front_x = [f.fitness[0] for f in final_archive]
        pareto_front_y = [f.fitness[1] for f in final_archive]

        figure = plt.figure(figsize=figsize)
        ax = figure.add_subplot(111)
        ax.plot(pareto_front_x, pareto_front_y, "bo-", label="Solutions in final archive")
        ax.set_title("Optimal solutions")
        ax.set_xlabel("Archetype set size")
        ax.set_ylabel("Error")
        ax.set_xlim([1, X_train.shape[0]])
        ax.set_ylim([0, 0.4])
        plt.tight_layout()
        plt.savefig(os.path.join(experiment_name, "%s_EvoArch_%s_pareto.png" % (dbname, classifier_name)))
        plt.savefig(os.path.join(experiment_name, "%s_EvoArch_%s_pareto.pdf" % (dbname, classifier_name)))
        plt.close(figure)

        figure = plt.figure(figsize=figsize)
        ax = figure.add_subplot(111)
        ax.plot(pareto_front_x, pareto_front_y, "bo-", label="Solutions in final archive")
        ax.set_title("Optimal solutions")
        ax.set_xlabel("Archetype set size")
        ax.set_ylabel("Error")
        plt.tight_layout()
        plt.savefig(os.path.join(experiment_name, "%s_EvoArch_%s_pareto_zoom.png" % (dbname, classifier_name)))
        plt.savefig(os.path.join(experiment_name, "%s_EvoArch_%s_pareto_zoom.pdf" % (dbname, classifier_name)))
        plt.close(figure)

        # initial performance
        X_err, testAccuracy, model, fail_points, y_pred = evaluate_core(
            X_trainval, y_trainval, X_test, y_test, classifier[0],
            cname=classifier_name, SEED=seed)
        X_err, trainAccuracy, model, fail_points, y_pred = evaluate_core(
            X_trainval, y_trainval, X_trainval, y_trainval, classifier[0],
            cname=classifier_name, SEED=seed)

        logger.info("Compute performances!")
        logger.info("Elapsed time (seconds): %.4f" % (exec_time))
        logger.info("Initial performance: train=%.4f, test=%.4f, size: %d"
                    % (trainAccuracy, testAccuracy, X_train.shape[0]))

        # best solution
        accuracy = []
        for sol in final_archive:
            c = np.array(sol.candidate)
            X_core = c[:, :-1]
            y_core = c[:, -1]
            X_err, accuracy_val, model, fail_points, y_pred = evaluate_core(
                X_core, y_core, X_val, y_val, classifier[0],
                cname=classifier_name, SEED=seed)
            X_err, accuracy_train, model, fail_points, y_pred = evaluate_core(
                X_core, y_core, X_train, y_train, classifier[0],
                cname=classifier_name, SEED=seed)
            accuracy.append(np.mean([accuracy_val, accuracy_train]))

        best_ids = np.array(np.argsort(accuracy)).astype('int')[::-1]

        count = 0
        for i in best_ids:
            if count > 2:
                break

            c = np.array(final_archive[i].candidate)
            X_core = c[:, :-1]
            y_core = c[:, -1]

            X_err, accuracy_train, model, fail_points, y_pred = evaluate_core(
                X_core, y_core, X_train, y_train, classifier[0],
                cname=classifier_name, SEED=seed)
            X_err, accuracy_val, model, fail_points, y_pred = evaluate_core(
                X_core, y_core, X_val, y_val, classifier[0],
                cname=classifier_name, SEED=seed)
            X_err, accuracy, model, fail_points, y_pred = evaluate_core(
                X_core, y_core, X_test, y_test, classifier[0],
                cname=classifier_name, SEED=seed)
            logger.info("Minimal train/val error: train: %.4f, val: %.4f; test: %.4f, size: %d"
                        % (accuracy_train, accuracy_val, accuracy, X_core.shape[0]))

            if False:  # (dbname == "mnist" or dbname == "digits") and count == 0:
                if dbname == "mnist":
                    H, W = 28, 28
                if dbname == "digits":
                    H, W = 8, 8

                logger.info("Now saving figures...")

                # save archetypes
                for index in range(0, len(y_core)):
                    image = np.reshape(X_core[index, :], (H, W))
                    plt.figure()
                    plt.axis('off')
                    plt.imshow(image, cmap=plt.cm.gray_r)
                    plt.title('Label: %d' % (y_core[index]))
                    plt.tight_layout()
                    plt.savefig(os.path.join(experiment_name, "digit_%d_idx_%d.pdf" % (y_core[index], index)))
                    plt.savefig(os.path.join(experiment_name, "digit_%d_idx_%d.png" % (y_core[index], index)))
                    plt.close()

                # save test errors
                e = 1
                for index in range(0, len(y_test)):
                    if fail_points[index] == True:
                        image = np.reshape(X_test[index, :], (H, W))
                        plt.figure()
                        plt.axis('off')
                        plt.imshow(image, cmap=plt.cm.gray_r)
                        plt.title('Label: %d - Prediction: %d' % (y_test[index], y_pred[index]))
                        plt.savefig(os.path.join(experiment_name,
                                                 "err_lab_%d_pred_%d_idx_%d.pdf" % (y_test[index], y_pred[index], e)))
                        plt.savefig(os.path.join(experiment_name,
                                                 "err_lab_%d_pred_%d_idx_%d.png" % (y_test[index], y_pred[index], e)))
                        plt.close()
                        e = e + 1

            # plot decision boundaries if we have only 2 dimensions!
            if X.shape[1] == 2:
                cmap = ListedColormap(sns.color_palette("bright", 3).as_hex())
                xx, yy = make_meshgrid(X[:, 0], X[:, 1])

                figure = plt.figure(figsize=figsize)
                _, Z_0 = plot_contours(model, xx, yy, colors='k', alpha=0.2)
                # plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap, marker='s', alpha=0.4, label="train")
                plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap, marker='+', alpha=0.3, label="test")
                plt.scatter(X_core[:, 0], X_core[:, 1], c=y_core, cmap=cmap, marker='D',
                            facecolors='none', edgecolors='none', alpha=1, label="archetypes")
                plt.scatter(X_err[:, 0], X_err[:, 1], marker='x', facecolors='k', edgecolors='k',
                            alpha=1, label="errors")
                plt.legend()
                plt.title("%s - acc. %.4f" % (classifier_name, accuracy))
                plt.tight_layout()
                plt.savefig(os.path.join(experiment_name, "%s_EvoArch_%s_%d.png" % (dbname, classifier_name, count)))
                plt.savefig(os.path.join(experiment_name, "%s_EvoArch_%s_%d.pdf" % (dbname, classifier_name, count)))
                plt.close(figure)

                if count == 0:
                    # using all samples in the training set
                    X_err, accuracy, model, fail_points, y_pred = evaluate_core(
                        X_trainval, y_trainval, X_test, y_test, classifier[0],
                        cname=classifier_name, SEED=seed)
                    X_err_train, trainAccuracy, model_train, fail_points_train, y_pred_train = evaluate_core(
                        X_trainval, y_trainval, X_trainval, y_trainval, classifier[0],
                        cname=classifier_name, SEED=seed)

                    figure = plt.figure(figsize=figsize)
                    _, Z_0 = plot_contours(model, xx, yy, colors='k', alpha=0.2)
                    plt.scatter(X_trainval[:, 0], X_trainval[:, 1], c=y_trainval, cmap=cmap,
                                marker='s', alpha=0.4, label="train")
                    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap,
                                marker='+', alpha=0.4, label="test")
                    plt.scatter(X_err[:, 0], X_err[:, 1], marker='x', facecolors='k', edgecolors='k',
                                alpha=1, label="errors")
                    plt.legend()
                    plt.title("%s - acc. %.4f" % (classifier_name, accuracy))
                    plt.tight_layout()
                    plt.savefig(os.path.join(experiment_name,
                                             "%s_EvoArch_%s_alltrain.png" % (dbname, classifier_name)))
                    plt.savefig(os.path.join(experiment_name,
                                             "%s_EvoArch_%s_alltrain.pdf" % (dbname, classifier_name)))
                    plt.close(figure)

            count = count + 1

    logger.handlers.pop()
    return
}  # closes the methods dict defined before this excerpt

## Automatic cluster coloring
cNorm = colors.Normalize(vmin=0, vmax=1)
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=color_map)

for method_name, m in methods.items():
    print_title('+', '%s' % method_name)

    x, y = sg.make_circles(n_samples=n_samples, factor=.5, noise=.05)
    pred_labels = m.fit_predict(x)
    colors = [scalarMap.to_rgba(l) for l in pred_labels]
    plt.scatter(x[:, 0], x[:, 1], c=colors)
    plt.savefig('./plots/circle_%s.pdf' % method_name)
    clear_plt()

    x, y = sg.make_moons(n_samples=n_samples, noise=.05)
    pred_labels = m.fit_predict(x)
    colors = [scalarMap.to_rgba(l) for l in pred_labels]
    plt.scatter(x[:, 0], x[:, 1], c=colors)
    plt.savefig('./plots/moons_%s.pdf' % method_name)
    clear_plt()

# THIRD EXPERIMENT
if make_experiment[2]:
    print_title('=', 'THIRD EXPERIMENT')

    data = pd.read_csv('./data/processed.csv')
    x = data.drop(['num'], axis=1)
    experiments = {
        'binary': (2, data.num.apply(lambda x: int(x != 0))),
        'normal': (5, data.num)
    }
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_moons  # samples_generator module was removed from sklearn
import seaborn as sns

x, y = make_moons(1000, noise=.05, random_state=0)
X_moon = pd.DataFrame(x, columns=['f1', 'f2'])

# clustering method: elbow curve over the number of clusters
cost_list = []
k_val = range(2, 11)
for i in k_val:
    kmean_1 = KMeans(n_clusters=i, init='random', n_init=10, max_iter=300, tol=1e-4, random_state=None)
    kmean_1.fit_predict(X_moon)
    cost_list.append(kmean_1.inertia_)

plt.figure(figsize=(6, 5))
plt.plot(k_val, cost_list, '-*m')
plt.grid()
plt.xlabel('Number of centroids (clusters)')
plt.ylabel('Cost function')

km = KMeans(n_clusters=2, init='random', n_init=10, max_iter=350, tol=1e-4, random_state=None)
# y_km holds the prediction result
km.fit(X_moon)
y_km = km.predict(X_moon)

# plot
X_moon['k_means'] = y_km
sns.lmplot(data=X_moon, x='f1', y='f2', fit_reg=False, legend=False,
           hue='k_means', palette=['#eb6c6a', '#6aeb6c']).set(
plt.ion()

# linearly separable with two classes
plt.axis([-15, 5, -12, 12])
points, clusters = make_blobs(n_samples=100, centers=2, n_features=2, cluster_std=2.5, random_state=1)
plot(points, clusters)

# linearly separable with three classes
points, clusters = make_blobs(n_samples=150, centers=[[1, 1], [-1, -1], [1, -2]],
                              n_features=2, cluster_std=0.3, random_state=123)
plot(points, clusters)

# non-linearly separable
points, clusters = make_blobs(n_samples=100, centers=2, n_features=2, cluster_std=3.8, random_state=1)
plot(points, clusters)

# very non-linearly separable
points, clusters = make_moons(n_samples=100, noise=0.1, random_state=1)
plot(points, clusters)
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for i, j in enumerate(np.unique(y_set)):
        plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                    c=ListedColormap(('red', 'blue'))(i), label=j)
    plt.title(text)
    plt.xlabel('X')
    plt.ylabel('y')
    plt.legend()
    plt.show()

"""## Make weird datasets to throw our models off"""

from sklearn.datasets import make_moons  # samples_generator module was removed from sklearn

# generate 2d classification dataset
X, y = make_moons(n_samples=1000, noise=0.3)
df = pd.DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
colors = {0: 'red', 1: 'blue'}
fig, ax = plt.subplots()
grouped = df.groupby('label')
for key, group in grouped:
    group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key])
plt.show()

datadict = {'X1': X[:, 0], 'X2': X[:, 1], 'target': y}
data = pd.DataFrame(data=datadict)
X = data.iloc[:, [0, 1]].values
y = data.iloc[:, 2].values

# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split
# add uniform noise to the features
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)

df = pd.DataFrame(np.c_[X, labels], columns=['feature1', 'feature2', 'labels'])
df.plot.scatter('feature1', 'feature2', s=100, c=list(df['labels']), cmap='rainbow',
                colorbar=False, alpha=0.8, title='dataset by make_classification')
plt.show()

# samples_generator module was removed from sklearn; import from sklearn.datasets
from sklearn.datasets import make_circles

X, labels = make_circles(n_samples=200, noise=0.2, factor=0.2)
df = pd.DataFrame(np.c_[X, labels], columns=['feature1', 'feature2', 'labels'])
df.plot.scatter('feature1', 'feature2', s=100, c=list(df['labels']), cmap='rainbow',
                colorbar=False, alpha=0.8, title='dataset by make_circles')
plt.show()

from matplotlib import pyplot as plt
from sklearn.datasets import make_moons

x1, y1 = make_moons(n_samples=1000, noise=0.1)
plt.title('make_moons function example')
plt.scatter(x1[:, 0], x1[:, 1], marker='o', c=y1)
plt.show()

from sklearn.datasets import make_regression

X, Y, coef = make_regression(n_samples=100, n_features=1, bias=5, tail_strength=0,
                             noise=1, shuffle=True, coef=True, random_state=None)
print(coef)  # e.g. 49.08950060982939 (varies, since random_state=None)
df = pd.DataFrame(np.c_[X, Y], columns=['x', 'y'])
df.plot('x', 'y', kind='scatter', s=50, c='m', edgecolor='k')
plt.show()
from sklearn.datasets import make_moons  # samples_generator module was removed from sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

X, Y = make_moons(n_samples=400, noise=0.05, random_state=0)
# print(X)
plt.scatter(X[:, 0], X[:, 1])
plt.savefig('scatterplot.png')

kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
kmeans_results = kmeans.predict(X)
print(kmeans_results)
plt.scatter(X[:, 0], X[:, 1], c=kmeans_results)
plt.savefig('scatterplot_color.png')