import base64
from io import BytesIO

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs, make_circles


def generate_data(choice='linear'):
    global X, Y, N
    if choice == 'linear':
        N = np.random.randint(50, 150)
        std = np.random.randint(6, 10) * 0.1  # random spread in [0.6, 0.9]
        X, Y = make_blobs(n_samples=N, centers=2, random_state=0,
                          cluster_std=std)
    elif choice == 'circle':
        N = np.random.randint(100, 200)
        factor = np.random.randint(1, 6) * 0.1  # inner/outer radius ratio
        X, Y = make_circles(n_samples=N, factor=factor, noise=0.1)
    plt.delaxes()  # drop the previous axes so repeated calls start clean
    plt.scatter(X[:, 0], X[:, 1], c=Y, cmap='winter', s=100, edgecolors='black')
    plt.xlabel("Feature1")
    plt.ylabel("Feature2")
    plt.title("DATA POINTS")
    gendat = BytesIO()
    plt.savefig(gendat, format='png')
    # gendat.seek(0) is unnecessary: b64encode reads the whole buffer
    gendat_img = base64.b64encode(gendat.getbuffer()).decode('ascii')
    return gendat_img
def circlesexample(self):
    X, y = make_circles(90, factor=0.2, noise=0.1)
    r = np.exp(-(X ** 2).sum(1))  # RBF lift used as the z coordinate
    zaxis = [0.2, 0.4, 0.6, 0.8, 1.0]
    zaxislabel = [r'0.2', r'0.4', r'0.6', r'0.8', r'1.0']
    self.ax.scatter(X[:, 0], X[:, 1], r, c=y, s=70, cmap='seismic')
    self.ax.view_init(elev=90, azim=90)
    self.ax.set_xlabel('X', color='w', fontproperties=self.prop, fontsize=60)
    self.ax.set_ylabel('Y', color='w', fontproperties=self.prop, fontsize=60)
    self.ax.set_zlabel('Z', labelpad=-1, color='red',
                       fontproperties=self.prop, fontsize=60)
    # set the ticks first, then their labels, so the labels are not reset
    self.ax.set_zticks(zaxis)
    self.ax.set_zticklabels(zaxislabel, fontsize=7, color='none')
    plt.xticks(ticks=np.arange(-1.2, 1.4, .2), labels='')
    plt.yticks(ticks=np.arange(-1.2, 1.4, .2), labels='')
    self.ax.grid(linewidth=20)
    return self.fig,
def __init__(self):
    self.start_time = time.time()
    if Configuration.data == "Social_Network_Ads.csv":
        self.dataset = pd.read_csv(str(Configuration.data))
    if Configuration.algorithm == "linear_regression":
        self.X = self.dataset.iloc[:, :-1].values
        self.y = self.dataset.iloc[:, 1].values
    elif Configuration.algorithm in ("logistic_regression", "svc",
                                     "decision_tree_classification",
                                     "random_forest_classification", "knn"):
        if Configuration.data == "Social_Network_Ads.csv":
            self.X = self.dataset.iloc[:, [2, 3]].values
            self.y = self.dataset.iloc[:, 4].values
        else:
            if Configuration.data == "moons":
                from sklearn.datasets import make_moons
                self.X, self.y = make_moons(100, noise=.2, random_state=0)
            elif Configuration.data == "circles":
                from sklearn.datasets import make_circles
                self.X, self.y = make_circles(100, factor=.5, noise=.1,
                                              random_state=0)
    elif Configuration.algorithm == "polynomial_regression":
        self.X = self.dataset.iloc[:, 1:2].values
        self.y = self.dataset.iloc[:, 2].values
    elif Configuration.algorithm == "kmeans":
        self.X = self.dataset.iloc[:, [3, 4]].values
        self.y = None
    if Configuration.data == "Social_Network_Ads.csv":
        self.directory = "SocialNetworkAds"
    elif Configuration.data == "moons":
        self.directory = "Moons"
    elif Configuration.data == "circles":
        self.directory = "Circles"
def ex2():
    X, y = make_circles(100, factor=.1, noise=.1)
    clf = SVC(kernel='linear').fit(X, y)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='summer')
    plt.show()
    # a linear kernel cannot separate concentric circles
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='summer')
    plot_svc_decision_function(clf)
    plt.show()
    # lift the data onto a third axis with an RBF feature
    r = np.exp(-(X[:, 0] ** 2 + X[:, 1] ** 2))
    ax = plt.subplot(projection='3d')
    ax.scatter3D(X[:, 0], X[:, 1], r, c=y, s=50, cmap='summer')
    ax.view_init(elev=30, azim=30)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('r')
    plt.show()
    # the RBF kernel does this lifting implicitly
    clf = SVC(kernel='rbf')
    clf.fit(X, y)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='summer')
    plot_svc_decision_function(clf)
    plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
                s=200, facecolors='none', edgecolors='k')
    plt.show()
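# The SVC snippets in this section (ex2, kernel_model_rbf, kernel_model_linear,
# nonLinearSeparable, train_svm_plus, ...) call a plot_svc_decision_function
# helper that is never defined here; plot_decision_function and
# plot_SVC_decision_function below look like the same helper under other names.
# A minimal sketch, assuming the (model, ax=None, plot_support=True) signature
# the call sites imply:
import numpy as np
import matplotlib.pyplot as plt


def plot_svc_decision_function(model, ax=None, plot_support=True):
    """Plot the decision function of a fitted 2-D SVC (sketch)."""
    if ax is None:
        ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    # evaluate the decision function on a 30x30 grid over the current axes
    xs = np.linspace(xlim[0], xlim[1], 30)
    ys = np.linspace(ylim[0], ylim[1], 30)
    Y, X = np.meshgrid(ys, xs)
    xy = np.vstack([X.ravel(), Y.ravel()]).T
    P = model.decision_function(xy).reshape(X.shape)
    # draw the decision boundary (level 0) and the margins (levels -1, +1)
    ax.contour(X, Y, P, colors='k', levels=[-1, 0, 1],
               alpha=0.5, linestyles=['--', '-', '--'])
    if plot_support:
        ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1],
                   s=300, linewidth=1, facecolors='none', edgecolors='k')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)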
def make_circles(self, ax=None, plot=False):
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=(12, 6))
    X, y = make_circles(100, factor=.1, noise=.1)  # the sklearn function, not this method
    if plot:
        mask = y > 0
        ax.scatter(X[mask, 0], X[mask, 1], c=self.val_to_color(y[mask]),
                   s=50, label="Positive")
        ax.scatter(X[~mask, 0], X[~mask, 1], c=self.val_to_color(y[~mask]),
                   s=50, label="Negative")
        # plt.scatter(X[:, 0], X[:, 1], c=self.val_to_color(y), s=50, cmap='autumn')
        ax.set_xlabel("$x_1$")
        ax.set_ylabel("$x_2$")
        ax.legend()
    return X, y
def main(_):
    cut = int(FLAGS.n_samples * 0.7)
    start = time.time()
    data, features = make_circles(n_samples=FLAGS.n_samples, shuffle=True,
                                  noise=0.12, factor=0.4)
    tr_data, tr_features = data[:cut], features[:cut]
    te_data, te_features = data[cut:], features[cut:]
    test = []

    fig, ax = plt.subplots()
    ax.scatter(tr_data[:, 0], tr_data[:, 1], marker='o', s=100,
               c=tr_features, cmap=plt.cm.coolwarm)
    plt.show()

    # 1-nearest-neighbor: label each test point with its closest training point
    with tf.Session() as sess:
        for i, j in zip(te_data, te_features):
            distances = tf.reduce_sum(tf.square(tf.subtract(i, tr_data)), axis=1)
            neighbor = tf.argmin(distances, 0)
            test.append(tr_features[sess.run(neighbor)])

    fig, ax = plt.subplots()
    ax.scatter(te_data[:, 0], te_data[:, 1], marker='o', s=100,
               c=test, cmap=plt.cm.coolwarm)
    plt.show()

    end = time.time()
    print("Found in %.2f seconds" % (end - start))
    print("Cluster assignments:", test)
def kernel_model_rbf():
    x, y = make_circles(100, factor=.1, noise=.1)
    clf = SVC(kernel='rbf', C=1E6).fit(x, y)
    plt.scatter(x[:, 0], x[:, 1], c=y, s=50, cmap='autumn')
    plot_svc_decision_function(clf, plot_support=False)
    plot_3d(x, y)
    plt.show()
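# kernel_model_rbf and kernel_model_linear also call plot_3d(x, y), which is
# not defined in this section. A plausible sketch, assuming it lifts the
# points onto a third axis with the RBF feature r = exp(-||x||^2), as the
# other 3-D snippets here do:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d  # noqa: F401 -- registers the '3d' projection


def plot_3d(x, y, elev=30, azim=30):
    r = np.exp(-(x ** 2).sum(1))  # large near the origin, small far away
    ax = plt.subplot(projection='3d')
    ax.scatter3D(x[:, 0], x[:, 1], r, c=y, s=50, cmap='autumn')
    ax.view_init(elev=elev, azim=azim)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('r')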
def get_toy_classification_data(n_samples=100, centers=3, n_features=2,
                                type_data="blobs"):
    # generate a 2-D classification dataset
    if type_data == "blobs":
        X, y = make_blobs(n_samples=n_samples, centers=centers,
                          n_features=n_features)
    elif type_data == "moons":
        X, y = make_moons(n_samples=n_samples, noise=0.1)
    elif type_data == "circles":
        X, y = make_circles(n_samples=n_samples, noise=0.05)

    # scatter plot, dots colored by class value
    # df = DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
    # colors = {0: 'red', 1: 'blue', 2: 'green'}
    # fig, ax = pyplot.subplots()
    # grouped = df.groupby('label')
    # for key, group in grouped:
    #     group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key])
    # pyplot.show()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        stratify=None)
    classes = np.unique(y_train)

    if 0:  # disabled OneHotEncoder experiment, kept for reference
        enc = OneHotEncoder().fit(classes.reshape(-1, 1))
        y_train = enc.transform(y_train.reshape(-1, 1))
        print(y_test)
        y_test = enc.transform(y_test.reshape(-1, 1))
        print(y_test)

    y_train = one_hot_encode(y_train, classes)
    y_test = one_hot_encode(y_test, classes)
    return X_train, y_train, X_test, y_test, classes
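# get_toy_classification_data relies on a one_hot_encode helper that is not
# shown. A minimal sketch matching how it is called (labels plus the array
# of known classes):
import numpy as np


def one_hot_encode(y, classes):
    index = {c: i for i, c in enumerate(classes)}  # class value -> column
    encoded = np.zeros((len(y), len(classes)))
    for row, label in enumerate(y):
        encoded[row, index[label]] = 1.0
    return encoded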
def _get_circles(*args, **kwargs):
    X, y = make_circles(n_samples=100, noise=0.1)
    metadata = {
        'regression': False,
        'scoring': classifier_scoring,
        'primary_metric': 'accuracy',
    }
    return X, y, metadata
def make_circles(self, plot=False):
    X, y = make_circles(100, factor=.1, noise=.1)  # the sklearn function, not this method
    if plot:
        plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
        plt.xlabel("$x_1$")
        plt.ylabel("$x_2$")
    return X, y
def generate_circle_data(N=100, seed=1):
    np.random.seed(seed)
    X, y = make_circles(N, factor=.1, noise=.1)
    y[y == 0] = -1  # relabel the classes as {-1, +1}
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=42)
    return X_train.tolist(), X_test.tolist(), y_train.tolist(), y_test.tolist()
def kernel_model_linear():
    """Linear kernel (for contrast with the RBF version above)."""
    x, y = make_circles(100, factor=.1, noise=.1)
    clf = SVC(kernel='linear').fit(x, y)
    plt.scatter(x[:, 0], x[:, 1], c=y, s=50, cmap='autumn')
    plot_svc_decision_function(clf, plot_support=False)
    plot_3d(x, y)
    plt.show()
def spectral_clustering(g_directed):
    # X = np.array(g_directed.edges())
    # W = pairwise_distances(X, metric="euclidean")
    # vectorizer = np.vectorize(lambda x: 1 if x < 5 else 0)
    # W = np.vectorize(vectorizer)(W)
    # print(W)

    # adjacency, degree, and (unnormalized) Laplacian matrices of the graph
    W = nx.adjacency_matrix(g_directed)
    print(W.todense())
    D = np.diag(np.sum(np.array(W.todense()), axis=1))
    print('degree matrix:')
    print(D)
    L = D - W
    print('laplacian matrix:')
    print(L)

    # eigendecomposition of the Laplacian
    e, v = np.linalg.eig(L)
    print('eigenvalues:')
    print(e)
    print('eigenvectors:')
    print(v)

    # cluster on an eigenvector belonging to a small eigenvalue
    i = np.where(e < 0.5)[0]
    U = np.array(v[:, i[1]])
    km = KMeans(init='k-means++', n_clusters=3)
    km.fit(U)
    print(km.labels_)

    # comparison on synthetic rings: K-means vs. spectral clustering
    X, clusters = make_circles(n_samples=1000, noise=.05, factor=.5,
                               random_state=0)
    plt.scatter(X[:, 0], X[:, 1])

    # using K-means
    km = KMeans(init='k-means++', n_clusters=2)
    km_clustering = km.fit(X)
    plt.scatter(X[:, 0], X[:, 1], c=km_clustering.labels_, cmap='rainbow',
                alpha=0.7, edgecolors='b')

    # using scikit-learn's SpectralClustering implementation
    sc = SpectralClustering(n_clusters=2, affinity='nearest_neighbors',
                            random_state=0)
    sc_clustering = sc.fit(X)
    plt.scatter(X[:, 0], X[:, 1], c=sc_clustering.labels_, cmap='rainbow',
                alpha=0.7, edgecolors='b')
def nonLinearSeparable():
    from sklearn.datasets import make_circles
    X, y = make_circles(100, factor=0.1, noise=.1)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
    plt.show()

    from sklearn import svm
    clf = svm.SVC(kernel='rbf', C=1E6)
    clf.fit(X, y)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
    plot_decision_function(clf)
def get_dataset(self):
    if self.type == 'moon':
        datas, labels = make_moons(n_samples=self.n_samples, noise=self.noise)
    elif self.type == 'circle':
        datas, labels = make_circles(n_samples=self.n_samples, noise=self.noise)
    else:
        # fail loudly instead of hitting a NameError on `datas` below
        raise ValueError('wrong dataset type input: %r' % self.type)
    dataset = {'data': datas, 'target': labels}
    return dataset
def train_svm_plus():
    # 2-D ring data; factor is the inner/outer circle ratio in (0, 1)
    X, y = make_circles(100, factor=0.1, noise=0.1)
    # switch to a radial basis function kernel
    clf = SVC(kernel='rbf')
    clf.fit(X, y)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
    plot_SVC_decision_function(clf, plot_support=False)
    plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
                s=300, lw=1, facecolors='none', edgecolors='k')
    return X, y
def test_mapper_filterer():
    data, labels = make_circles(n_samples=2000, noise=0.03, factor=0.3)
    params = {
        'coverer__intervals': 10,
        'coverer__overlap': 0.1,
        'clusterer__min_samples': 3,
        'clusterer__eps': 0.5,
    }
    m_filter = Mapper(filterer=MinMaxScaler(), params=params)
    m_nofilter = Mapper(filterer=MinMaxScaler(), params=params)
    scaled_data = MinMaxScaler().fit_transform(data)
    # internal filtering and pre-scaled input must give the same graph
    m_filter.fit(data)
    m_nofilter.fit(data, scaled_data)
    assert_true(m_filter.links_ == m_nofilter.links_)
    assert_true(len(m_filter.nodes_) == len(m_nofilter.nodes_))
def single_cluster(dim, samples, std, clusters, cluster_func, x=None, y=None):
    # x, y = make_blobs(n_samples=samples, centers=clusters, n_features=dim,
    #                   random_state=0, cluster_std=std)
    if x is None or y is None:
        x, y = samples_generator.make_circles(n_samples=samples,
                                              random_state=True,
                                              factor=0.3, noise=0.05)
    _y = cluster_func(x, clusters)
    acc = sklearn.metrics.homogeneity_score(y, _y)
    hyp.plot(x, '.', group=y, save_path='or.png')
    hyp.plot(x, '.', group=_y, save_path='grp.png')
    print('Accuracy {0:0.2f}'.format(acc))
def test_graph_simple():
    data, labels = make_circles(n_samples=2000, noise=0.03, factor=0.3)
    params = {
        'coverer__intervals': 10,
        'coverer__overlap': 0.1,
        'clusterer__min_samples': 3,
        'clusterer__eps': 0.5,
    }
    m = Mapper(params=params)
    scaled_data = MinMaxScaler().fit_transform(data)
    m.fit(data, scaled_data)
    categories = {"labels": labels}
    scales = {"y[0]": scaled_data[:, 0], "y[1]": scaled_data[:, 1]}
    json_graph_str = json_graph(m, categories, scales)
    # check that it can be loaded, to validate the JSON
    json_graph_dict = json.loads(json_graph_str)
    html_graph_str = html_graph(m, categories, scales)
    # validate HTML?
def choose_dataset(chosen_dataset, n_points):
    X = None
    if chosen_dataset == "blobs":
        X = make_blobs(n_samples=n_points, centers=4, n_features=2,
                       cluster_std=1.5, random_state=42)[0]
    elif chosen_dataset == "moons":
        X = make_moons(n_samples=n_points, noise=0.05, random_state=42)[0]
    elif chosen_dataset == "scatter":
        X = make_blobs(n_samples=n_points, cluster_std=[2.5, 2.5, 2.5],
                       random_state=42)[0]
    elif chosen_dataset == "circle":
        X = make_circles(n_samples=n_points, noise=0, random_state=42)[0]
    return X
def main():
    # suppress warnings
    warnings.filterwarnings("ignore")

    # "blob" data
    # x, y = samples_generator.make_blobs(n_samples=200, centers=2, cluster_std=1, random_state=0)
    # "moon" data
    # x, y = samples_generator.make_moons(n_samples=200, noise=0.05, random_state=0)
    # "ring" data
    x, y = samples_generator.make_circles(n_samples=200, noise=0.05,
                                          random_state=0, factor=0.4)

    # seven clustering algorithms to choose from
    # clusters = cluster.KMeans(2)                   # K-means++
    # clusters = cluster.MeanShift()                 # mean shift
    # clusters = cluster.AgglomerativeClustering(2)  # hierarchical clustering
    # clusters = cluster.AffinityPropagation()       # affinity propagation
    # clusters = cluster.SpectralClustering(n_clusters=2, affinity="nearest_neighbors")  # spectral
    # clusters = cluster.DBSCAN(eps=0.55, min_samples=5)  # density-based clustering
    clusters = GaussianMixture(n_components=2)       # Gaussian mixture

    # fit and predict
    _x = clusters.fit_predict(x)

    # three evaluation metrics
    # 1. silhouette coefficient
    print(metrics.silhouette_score(x, _x))
    # 2. Calinski-Harabasz index
    print(metrics.calinski_harabasz_score(x, _x))
    # 3. Davies-Bouldin index
    print(metrics.davies_bouldin_score(x, _x))

    # plot
    plt.scatter(x[:, 0], x[:, 1], c=_x, cmap='viridis')
    plt.show()
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles, make_blobs
from sklearn.cluster import DBSCAN
from sklearn.metrics import calinski_harabasz_score

# Create cluster dataset: a ring plus a dense blob inside it
X1, y1 = make_circles(n_samples=5000, factor=0.6, noise=0.05, random_state=666)
X2, y2 = make_blobs(n_samples=1000, n_features=2, centers=[[1.2, 1.2]],
                    cluster_std=[[0.1]], random_state=666)
X = np.concatenate((X1, X2))
y = np.concatenate((y1, y2))

# Create cluster model
dbscan = DBSCAN(eps=0.1, min_samples=5)
y_predict = dbscan.fit_predict(X)
print('Calinski-Harabasz Index Score: ', calinski_harabasz_score(X, y_predict))

# Visualization
plt.scatter(X[:, 0], X[:, 1], c=y_predict, marker='o', edgecolors='black')
plt.xlabel('X')
plt.ylabel('y')
plt.title('DBSCAN Cluster Algorithm')
plt.show()
import time

import matplotlib
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles

N = 210
K = 2
MAX_ITERS = 1000

cut = int(N * 0.7)
start = time.time()
data, features = make_circles(n_samples=N, shuffle=True, noise=0.12, factor=0.4)
# print(data.shape)
# print(data)
# print(features.shape)
tr_data, tr_features = data[:cut], features[:cut]
te_data, te_features = data[cut:], features[cut:]
test = []

fig, ax = plt.subplots()
ax.scatter(tr_data.transpose()[0], tr_data.transpose()[1], marker='o', s=100,
           c=tr_features, cmap=plt.cm.coolwarm)
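# The script above stops after plotting the training data. A continuation
# consistent with the main(_) version of the same 1-NN example earlier in
# this section might look like:
import tensorflow as tf

with tf.Session() as sess:
    # label each test point with the class of its nearest training point
    for i, j in zip(te_data, te_features):
        distances = tf.reduce_sum(tf.square(tf.subtract(i, tr_data)), axis=1)
        neighbor = tf.argmin(distances, 0)
        test.append(tr_features[sess.run(neighbor)])

fig, ax = plt.subplots()
ax.scatter(te_data[:, 0], te_data[:, 1], marker='o', s=100,
           c=test, cmap=plt.cm.coolwarm)
plt.show()
print("Found in %.2f seconds" % (time.time() - start))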
import time

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.datasets import make_circles

DATA_TYPE = 'blobs'

# Number of clusters; if we choose circles, 2 will be enough
if DATA_TYPE == 'circle':
    K = 2
else:
    K = 4

# Maximum number of iterations, if the conditions are not met
MAX_ITERS = 1000
N = 200

start = time.time()
centers = [(-2, -2), (-2, 1.5), (1.5, -2), (2, 1.5)]
if DATA_TYPE == 'circle':
    data, features = make_circles(n_samples=200, shuffle=True, noise=0.01,
                                  factor=0.4)
else:
    data, features = make_blobs(n_samples=200, centers=centers, n_features=2,
                                cluster_std=0.8, shuffle=False, random_state=42)

fig, ax = plt.subplots()
ax.scatter(np.asarray(centers).transpose()[0],
           np.asarray(centers).transpose()[1], marker='o', s=250)
plt.show()
print("TEST")
def main(selectedDataset="digits", pop_size=100, max_generations=100):
    # a few hard-coded values
    figsize = [5, 4]
    seed = 42
    # pop_size = 300
    offspring_size = 2 * pop_size
    # max_generations = 300
    maximize = False
    # selectedDataset = "circles"
    selectedClassifiers = ["SVC"]

    # a list of classifiers
    allClassifiers = [
        [RandomForestClassifier, "RandomForestClassifier", 1],
        [BaggingClassifier, "BaggingClassifier", 1],
        [SVC, "SVC", 1],
        [RidgeClassifier, "RidgeClassifier", 1],
        # [AdaBoostClassifier, "AdaBoostClassifier", 1],
        # [ExtraTreesClassifier, "ExtraTreesClassifier", 1],
        # [GradientBoostingClassifier, "GradientBoostingClassifier", 1],
        # [SGDClassifier, "SGDClassifier", 1],
        # [PassiveAggressiveClassifier, "PassiveAggressiveClassifier", 1],
        # [LogisticRegression, "LogisticRegression", 1],
    ]
    selectedClassifiers = [classifier[1] for classifier in allClassifiers]

    folder_name = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") + \
        "-archetypes-" + selectedDataset + "-" + str(pop_size)
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    else:
        sys.stderr.write("Error: folder \"" + folder_name +
                         "\" already exists. Aborting...\n")
        sys.exit(0)

    # open the logging file
    logfilename = os.path.join(folder_name, 'logfile.log')
    logger = setup_logger('logfile_' + folder_name, logfilename)
    logger.info("All results will be saved in folder \"%s\"" % folder_name)

    # load different datasets, prepare them for use
    logger.info("Preparing data...")
    # synthetic databases
    centers = [[1, 1], [-1, -1], [1, -1]]
    blobs_X, blobs_y = make_blobs(n_samples=400, centers=centers, n_features=2,
                                  cluster_std=0.6, random_state=seed)
    circles_X, circles_y = make_circles(n_samples=400, noise=0.15, factor=0.4,
                                        random_state=seed)
    moons_X, moons_y = make_moons(n_samples=400, noise=0.2, random_state=seed)
    iris = datasets.load_iris()
    digits = datasets.load_digits()
    # forest_X, forest_y = loadForestCoverageType()  # local function
    mnist_X, mnist_y = loadMNIST()  # local function

    dataList = [
        [blobs_X, blobs_y, 0, "blobs"],
        [circles_X, circles_y, 0, "circles"],
        [moons_X, moons_y, 0, "moons"],
        [iris.data, iris.target, 0, "iris4"],
        [iris.data[:, 2:4], iris.target, 0, "iris2"],
        [digits.data, digits.target, 0, "digits"],
        # [forest_X, forest_y, 0, "covtype"],
        [mnist_X, mnist_y, 0, "mnist"],
    ]

    # argparse; all arguments are optional
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--classifiers", "-c", nargs='+',
        help="Classifier(s) to be tested. Default: %s. Accepted values: %s" %
        (selectedClassifiers[0], [x[1] for x in allClassifiers]))
    parser.add_argument(
        "--dataset", "-d",
        help="Dataset to be tested. Default: %s. Accepted values: %s" %
        (selectedDataset, [x[3] for x in dataList]))
    parser.add_argument("--pop_size", "-p", type=int,
                        help="EA population size. Default: %d" % pop_size)
    parser.add_argument("--offspring_size", "-o", type=int,
                        help="EA offspring size. Default: %d" % offspring_size)
    parser.add_argument("--max_generations", "-mg", type=int,
                        help="Maximum number of generations. Default: %d" %
                        max_generations)

    # finally, parse the arguments
    args = parser.parse_args()

    # a few checks on the (optional) inputs
    if args.dataset:
        selectedDataset = args.dataset
        if selectedDataset not in [x[3] for x in dataList]:
            logger.info("Error: dataset \"%s\" is not an accepted value. "
                        "Accepted values: %s" %
                        (selectedDataset, [x[3] for x in dataList]))
            sys.exit(0)
    if args.classifiers is not None and len(args.classifiers) > 0:
        selectedClassifiers = args.classifiers
        for c in selectedClassifiers:
            if c not in [x[1] for x in allClassifiers]:
                logger.info("Error: classifier \"%s\" is not an accepted value. "
                            "Accepted values: %s" %
                            (c, [x[1] for x in allClassifiers]))
                sys.exit(0)
    if args.max_generations:
        max_generations = args.max_generations
    if args.pop_size:
        pop_size = args.pop_size
    if args.offspring_size:
        offspring_size = args.offspring_size
    # TODO: check that min_points < max_points and max_generations > 0

    # print out the current settings
    logger.info("Settings of the experiment...")
    logger.info("Fixed random seed: %d" % (seed))
    logger.info("Selected dataset: %s; Selected classifier(s): %s" %
                (selectedDataset, selectedClassifiers))
    logger.info("Population size in EA: %d; Offspring size: %d; "
                "Max generations: %d" %
                (pop_size, offspring_size, max_generations))

    # create the list of classifiers
    classifierList = [x for x in allClassifiers if x[1] in selectedClassifiers]

    # pick the dataset
    db_index = -1
    for i in range(0, len(dataList)):
        if dataList[i][3] == selectedDataset:
            db_index = i
    dbname = dataList[db_index][3]
    X, y = dataList[db_index][0], dataList[db_index][1]
    number_classes = np.unique(y).shape[0]

    logger.info("Creating train/test split...")
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
    listOfSplits = [split for split in skf.split(X, y)]
    trainval_index, test_index = listOfSplits[0]
    X_trainval, y_trainval = X[trainval_index], y[trainval_index]
    X_test, y_test = X[test_index], y[test_index]
    # random_state is only meaningful with shuffle=True
    skf = StratifiedKFold(n_splits=3, shuffle=False)
    listOfSplits = [split for split in skf.split(X_trainval, y_trainval)]
    train_index, val_index = listOfSplits[0]
    X_train, y_train = X_trainval[train_index], y_trainval[train_index]
    X_val, y_val = X_trainval[val_index], y_trainval[val_index]
    logger.info("Training set: %d lines (%.2f%%); test set: %d lines (%.2f%%)" %
                (X_train.shape[0], 100.0 * float(X_train.shape[0] / X.shape[0]),
                 X_test.shape[0], 100.0 * float(X_test.shape[0] / X.shape[0])))

    # rescale data
    scaler = StandardScaler()
    sc = scaler.fit(X_train)
    X = sc.transform(X)
    X_trainval = sc.transform(X_trainval)
    X_train = sc.transform(X_train)
    X_val = sc.transform(X_val)
    X_test = sc.transform(X_test)

    for classifier in classifierList:
        classifier_name = classifier[1]

        # start creating folder name
        experiment_name = os.path.join(
            folder_name,
            datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") +
            "-archetypes-evolution-" + dbname + "-" + classifier_name)
        if not os.path.exists(experiment_name):
            os.makedirs(experiment_name)

        logger.info("Classifier used: " + classifier_name)

        start = time.time()
        solutions, trainAccuracy, testAccuracy = evolveArchetypes(
            X, y, X_train, y_train, X_test, y_test, classifier,
            pop_size, offspring_size, max_generations,
            number_classes=number_classes, maximize=maximize, seed=seed,
            experiment_name=experiment_name)
        end = time.time()
        exec_time = end - start

        # only candidates with all classes are considered
        final_archive = []
        for sol in solutions:
            c = np.array(sol.candidate)
            y_core = c[:, -1]
            if len(set(y_core)) == number_classes:
                final_archive.append(sol)

        logger.info("Now saving final Pareto front in a figure...")
        pareto_front_x = [f.fitness[0] for f in final_archive]
        pareto_front_y = [f.fitness[1] for f in final_archive]

        figure = plt.figure(figsize=figsize)
        ax = figure.add_subplot(111)
        ax.plot(pareto_front_x, pareto_front_y, "bo-",
                label="Solutions in final archive")
        ax.set_title("Optimal solutions")
        ax.set_xlabel("Archetype set size")
        ax.set_ylabel("Error")
        ax.set_xlim([1, X_train.shape[0]])
        ax.set_ylim([0, 0.4])
        plt.tight_layout()
        plt.savefig(os.path.join(experiment_name, "%s_EvoArch_%s_pareto.png" %
                                 (dbname, classifier_name)))
        plt.savefig(os.path.join(experiment_name, "%s_EvoArch_%s_pareto.pdf" %
                                 (dbname, classifier_name)))
        plt.close(figure)

        # same plot, without the fixed axis limits
        figure = plt.figure(figsize=figsize)
        ax = figure.add_subplot(111)
        ax.plot(pareto_front_x, pareto_front_y, "bo-",
                label="Solutions in final archive")
        ax.set_title("Optimal solutions")
        ax.set_xlabel("Archetype set size")
        ax.set_ylabel("Error")
        plt.tight_layout()
        plt.savefig(os.path.join(experiment_name,
                                 "%s_EvoArch_%s_pareto_zoom.png" %
                                 (dbname, classifier_name)))
        plt.savefig(os.path.join(experiment_name,
                                 "%s_EvoArch_%s_pareto_zoom.pdf" %
                                 (dbname, classifier_name)))
        plt.close(figure)

        # initial performance
        X_err, testAccuracy, model, fail_points, y_pred = evaluate_core(
            X_trainval, y_trainval, X_test, y_test, classifier[0],
            cname=classifier_name, SEED=seed)
        X_err, trainAccuracy, model, fail_points, y_pred = evaluate_core(
            X_trainval, y_trainval, X_trainval, y_trainval, classifier[0],
            cname=classifier_name, SEED=seed)
        logger.info("Compute performances!")
        logger.info("Elapsed time (seconds): %.4f" % (exec_time))
        logger.info("Initial performance: train=%.4f, test=%.4f, size: %d" %
                    (trainAccuracy, testAccuracy, X_train.shape[0]))

        # best solution
        accuracy = []
        for sol in final_archive:
            c = np.array(sol.candidate)
            X_core = c[:, :-1]
            y_core = c[:, -1]
            X_err, accuracy_val, model, fail_points, y_pred = evaluate_core(
                X_core, y_core, X_val, y_val, classifier[0],
                cname=classifier_name, SEED=seed)
            X_err, accuracy_train, model, fail_points, y_pred = evaluate_core(
                X_core, y_core, X_train, y_train, classifier[0],
                cname=classifier_name, SEED=seed)
            accuracy.append(np.mean([accuracy_val, accuracy_train]))
        best_ids = np.array(np.argsort(accuracy)).astype('int')[::-1]

        count = 0
        for i in best_ids:
            if count > 2:
                break
            c = np.array(final_archive[i].candidate)
            X_core = c[:, :-1]
            y_core = c[:, -1]
            X_err, accuracy_train, model, fail_points, y_pred = evaluate_core(
                X_core, y_core, X_train, y_train, classifier[0],
                cname=classifier_name, SEED=seed)
            X_err, accuracy_val, model, fail_points, y_pred = evaluate_core(
                X_core, y_core, X_val, y_val, classifier[0],
                cname=classifier_name, SEED=seed)
            X_err, accuracy, model, fail_points, y_pred = evaluate_core(
                X_core, y_core, X_test, y_test, classifier[0],
                cname=classifier_name, SEED=seed)
            logger.info("Minimal train/val error: train: %.4f, val: %.4f; "
                        "test: %.4f, size: %d" %
                        (accuracy_train, accuracy_val, accuracy,
                         X_core.shape[0]))

            if False:  # (dbname == "mnist" or dbname == "digits") and count == 0:
                if dbname == "mnist":
                    H, W = 28, 28
                if dbname == "digits":
                    H, W = 8, 8
                logger.info("Now saving figures...")
                # save archetypes
                for index in range(0, len(y_core)):
                    image = np.reshape(X_core[index, :], (H, W))
                    plt.figure()
                    plt.axis('off')
                    plt.imshow(image, cmap=plt.cm.gray_r)
                    plt.title('Label: %d' % (y_core[index]))
                    plt.tight_layout()
                    plt.savefig(os.path.join(experiment_name,
                                             "digit_%d_idx_%d.pdf" %
                                             (y_core[index], index)))
                    plt.savefig(os.path.join(experiment_name,
                                             "digit_%d_idx_%d.png" %
                                             (y_core[index], index)))
                    plt.close()
                # save test errors
                e = 1
                for index in range(0, len(y_test)):
                    if fail_points[index]:
                        image = np.reshape(X_test[index, :], (H, W))
                        plt.figure()
                        plt.axis('off')
                        plt.imshow(image, cmap=plt.cm.gray_r)
                        plt.title('Label: %d - Prediction: %d' %
                                  (y_test[index], y_pred[index]))
                        plt.savefig(os.path.join(
                            experiment_name,
                            "err_lab_%d_pred_%d_idx_%d.pdf" %
                            (y_test[index], y_pred[index], e)))
                        plt.savefig(os.path.join(
                            experiment_name,
                            "err_lab_%d_pred_%d_idx_%d.png" %
                            (y_test[index], y_pred[index], e)))
                        plt.close()
                        e = e + 1

            # plot decision boundaries if we have only 2 dimensions!
            if X.shape[1] == 2:
                cmap = ListedColormap(sns.color_palette("bright", 3).as_hex())
                xx, yy = make_meshgrid(X[:, 0], X[:, 1])

                figure = plt.figure(figsize=figsize)
                _, Z_0 = plot_contours(model, xx, yy, colors='k', alpha=0.2)
                # plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train,
                #             cmap=cmap, marker='s', alpha=0.4, label="train")
                plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap,
                            marker='+', alpha=0.3, label="test")
                plt.scatter(X_core[:, 0], X_core[:, 1], c=y_core, cmap=cmap,
                            marker='D', facecolors='none', edgecolors='none',
                            alpha=1, label="archetypes")
                plt.scatter(X_err[:, 0], X_err[:, 1], marker='x',
                            facecolors='k', edgecolors='k', alpha=1,
                            label="errors")
                plt.legend()
                plt.title("%s - acc. %.4f" % (classifier_name, accuracy))
                plt.tight_layout()
                plt.savefig(os.path.join(experiment_name,
                                         "%s_EvoArch_%s_%d.png" %
                                         (dbname, classifier_name, count)))
                plt.savefig(os.path.join(experiment_name,
                                         "%s_EvoArch_%s_%d.pdf" %
                                         (dbname, classifier_name, count)))
                plt.close(figure)

                if count == 0:
                    # using all samples in the training set
                    X_err, accuracy, model, fail_points, y_pred = evaluate_core(
                        X_trainval, y_trainval, X_test, y_test, classifier[0],
                        cname=classifier_name, SEED=seed)
                    X_err_train, trainAccuracy, model_train, fail_points_train, y_pred_train = evaluate_core(
                        X_trainval, y_trainval, X_trainval, y_trainval,
                        classifier[0], cname=classifier_name, SEED=seed)
                    figure = plt.figure(figsize=figsize)
                    _, Z_0 = plot_contours(model, xx, yy, colors='k', alpha=0.2)
                    plt.scatter(X_trainval[:, 0], X_trainval[:, 1],
                                c=y_trainval, cmap=cmap, marker='s',
                                alpha=0.4, label="train")
                    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test,
                                cmap=cmap, marker='+', alpha=0.4, label="test")
                    plt.scatter(X_err[:, 0], X_err[:, 1], marker='x',
                                facecolors='k', edgecolors='k', alpha=1,
                                label="errors")
                    plt.legend()
                    plt.title("%s - acc. %.4f" % (classifier_name, accuracy))
                    plt.tight_layout()
                    plt.savefig(os.path.join(
                        experiment_name,
                        "%s_EvoArch_%s_alltrain.png" % (dbname, classifier_name)))
                    plt.savefig(os.path.join(
                        experiment_name,
                        "%s_EvoArch_%s_alltrain.pdf" % (dbname, classifier_name)))
                    plt.close(figure)

            count = count + 1

    logger.handlers.pop()
    return
        # (fragment: tail of a hand-rolled DBSCAN; q, queue, notAccess,
        # OldNotAccess, coreObjs, and k come from earlier in the function)
        if q in oldCoreObjs.keys():
            # Δ = N(q) ∩ Γ: unvisited samples in q's neighborhood
            delte = [val for val in oldCoreObjs[q] if val in notAccess]
            queue.extend(delte)  # add the samples in Δ to the queue Q
            # Γ = Γ \ Δ
            notAccess = [val for val in notAccess if val not in delte]
    k += 1
    C[k] = [val for val in OldNotAccess if val not in notAccess]
    for x in C[k]:
        if x in coreObjs.keys():
            del coreObjs[x]
    return C


if __name__ == '__main__':
    # randomly generate 1000 ring-shaped data points
    X, y_true = make_circles(n_samples=1000, noise=0.15)
    print(X)
    print(y_true)
    plt.scatter(X[:, 0], X[:, 1], c=y_true)
    plt.show()

    # DBSCAN: eps is the distance threshold ε, min_samples is the
    # neighborhood-size threshold MinPts, X is the data
    t0 = time.time()
    y_pred = DBSCAN(eps=.1, min_samples=6).fit_predict(X)
    t = time.time() - t0
    plt.scatter(X[:, 0], X[:, 1], c=y_pred)
    plt.title('time : %f' % t)
    plt.show()
    # (fragment: tail of a plot_svm(N, ax) helper used in the loop below)
    ax = ax or plt.gca()
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
    ax.set_xlim(-1, 4)
    ax.set_ylim(-1, 6)
    plot_svc_decision_function(model, ax)


fig, ax = plt.subplots(1, 2, figsize=(16, 6))
fig.subplots_adjust(left=0.0625, right=0.95, wspace=0.1)
for axi, N in zip(ax, [60, 120]):
    plot_svm(N, axi)
    axi.set_title('N = {0}'.format(N))

# SVM with a kernel
from sklearn.datasets import make_circles
X, y = make_circles(100, factor=.1, noise=.1)

clf = SVC(kernel='linear').fit(X, y)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
plot_svc_decision_function(clf, plot_support=False)
plt.show()

# add a new dimension r (an RBF feature)
from mpl_toolkits import mplot3d
r = np.exp(-(X ** 2).sum(1))

def plot_3D(elev=30, azim=30, X=X, y=y):
    ax = plt.subplot(projection='3d')
    ax.scatter3D(X[:, 0], X[:, 1], r, c=y, s=50, cmap='autumn')
    # (fragment: tail of a plot_model(model) helper that draws the margin)
    ylim = ax.get_ylim()
    xx = np.linspace(xlim[0], xlim[1], 200)
    yy = np.linspace(ylim[0], ylim[1], 200)
    YY, XX = np.meshgrid(yy, xx)
    xy = np.vstack([XX.ravel(), YY.ravel()]).T
    Z = model.decision_function(xy).reshape(XX.shape)
    # plot decision boundary and margins
    ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])
    # plot support vectors
    ax.scatter(model.support_vectors_[:, 0], model.support_vectors_[:, 1],
               s=300, linewidth=1, facecolors='none', edgecolors='k')
    plt.show()


y1_model = model.predict(X_train)
y2_model = model.predict(X_test)
print('Accuracy on train data:', accuracy_score(Y_train, y1_model))
print('Accuracy on test data:', accuracy_score(Y_test, y2_model))

X, Y = make_circles(n_samples=200, noise=0.05, factor=0.5)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
svclassifier = svm.SVC(kernel='linear', C=1E3)
svclassifier.fit(X_train, Y_train)
plot_model(svclassifier)
print('accuracy test = ', accuracy_score(Y_test, svclassifier.predict(X_test)))
print('accuracy train = ', accuracy_score(Y_train, svclassifier.predict(X_train)))
            # decision_function expects a 2-D array of samples
            P[i, j] = clf.decision_function([[xi, yj]])
    # plot the margins
    ax.contour(X, Y, P, colors='k', levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])


plt.subplot(412)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='spring')
plot_svc_decision_function(clf)
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=200, facecolors='none', edgecolors='k')

"""circles"""
from sklearn.datasets import make_circles
X, y = make_circles(100, factor=.1, noise=.1)
clf = SVC(kernel='linear').fit(X, y)

plt.subplot(413)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='spring')
plot_svc_decision_function(clf)

clf = SVC(kernel='rbf')
clf.fit(X, y)

plt.subplot(414)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='spring')
plot_svc_decision_function(clf)
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=200, facecolors='none', edgecolors='k')
plt.show()
# -*- coding: utf-8 -*-
"""Demo108_PCA_Circles.ipynb

# **Tame Your Python**
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
print(tf.__version__)

from sklearn.datasets import make_circles

n = 100
# generate a 2-D classification dataset
X, y = make_circles(n_samples=n)

# scatter plot, dots colored by class value
df = pd.DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
colors = {0: 'red', 1: 'blue'}
fig, ax = plt.subplots()
grouped = df.groupby('label')
for key, group in grouped:
    group.plot(ax=ax, kind='scatter', x='x', y='y', label=key,
               color=colors[key])
plt.show()

datadict = {'X1': X[:, 0], 'X2': X[:, 1], 'target': y}
data = pd.DataFrame(data=datadict)
X = data.iloc[:, [0, 1]].values
type(X)  # notebook cell output
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.svm import SVC
from DecisionBoundary import plot_svm_margin

# Creating toy data with circles
X, y = make_circles(100, factor=.1, noise=.1, random_state=88)

# plotting the data
plt.figure(figsize=[6, 6])
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap=plt.cm.coolwarm)
plt.show()

# calculating the radius
r = np.sum(X ** 2, axis=1) ** 0.5

# plotting the data, y-axis replaced with the radius
plt.figure(figsize=[6, 6])
plt.scatter(X[:, 0], r, c=y, s=50, cmap=plt.cm.coolwarm)
plt.show()

# SVM on the (x1, r) representation, where the classes are linearly separable
R = np.vstack([X[:, 0], r]).T
sv = SVC(kernel='linear', C=10000)
sv.fit(R, y)

# plotting the margin on the SVM of the transformed data
plt.figure(figsize=[6, 6])
plot_svm_margin(R, y, sv)
plt.show()
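# DecisionBoundary is a local module, so plot_svm_margin is not shown in this
# section. A stand-in sketch with the same (X, y, model) call signature,
# reusing the contour recipe from the other SVM snippets here (the exact
# behavior of the original helper is unknown):
import numpy as np
import matplotlib.pyplot as plt


def plot_svm_margin(X, y, model):
    ax = plt.gca()
    ax.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap=plt.cm.coolwarm)
    xs = np.linspace(*ax.get_xlim(), 100)
    ys = np.linspace(*ax.get_ylim(), 100)
    YY, XX = np.meshgrid(ys, xs)
    Z = model.decision_function(np.c_[XX.ravel(), YY.ravel()]).reshape(XX.shape)
    # decision boundary and +/-1 margins of the fitted linear SVC
    ax.contour(XX, YY, Z, colors='k', levels=[-1, 0, 1],
               alpha=0.5, linestyles=['--', '-', '--'])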
import time

import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.datasets import make_circles

N = 210
K = 2
# Maximum number of iterations, if the conditions are not met
MAX_ITERS = 1000

cut = int(N * 0.7)
start = time.time()
data, features = make_circles(n_samples=N, shuffle=True, noise=0.12,
                              factor=0.4)
tr_data, tr_features = data[:cut], features[:cut]
te_data, te_features = data[cut:], features[cut:]

fig, ax = plt.subplots()
ax.scatter(tr_data.transpose()[0], tr_data.transpose()[1], marker='o',
           s=100, c=tr_features, cmap=plt.cm.coolwarm)
plt.plot()

points = tf.Variable(data)
cluster_assignments = tf.Variable(tf.zeros([N], dtype=tf.int64))
sess = tf.Session()
sess.run(tf.global_variables_initializer())
test = []
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles

x, y = make_circles(n_samples=1000, noise=0.1, factor=0.2, random_state=0)
# x.shape

# plt.figure(figsize=(5, 5))
# # 'o' = marker shape, 'b' = color
# plt.plot(x[y == 0, 0], x[y == 0, 1], 'ob', alpha=0.5)
# plt.plot(x[y == 1, 0], x[y == 1, 1], 'xr', alpha=0.5)
# plt.xlim(-1.5, 1.5)
# plt.ylim(-1.5, 1.5)
# plt.legend(['0', '1'])
# plt.title("Make a plot")
# # plt.show()

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD

# a tiny MLP: one hidden tanh layer, sigmoid output for binary classification
model = Sequential()
model.add(Dense(4, input_shape=(2,), activation='tanh'))
model.add(Dense(1, activation='sigmoid'))
model.compile(SGD(learning_rate=0.5), 'binary_crossentropy',
              metrics=['accuracy'])
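# The snippet compiles the network but stops before training. A short
# continuation, assuming the usual fit/evaluate flow on the generated circles:
model.fit(x, y, epochs=20, verbose=0)
loss, acc = model.evaluate(x, y, verbose=0)
print("accuracy: %.3f" % acc)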
plot_svc_decision_function(clf)
plt.show()

# Note that a couple of the points touch the lines; these are known as our
# support vectors.
print(clf.support_vectors_)

# Visually check the concordance between the coordinates printed above
# and the highlighted points in the figure.
plt.scatter(x[:, 0], x[:, 1], c=y, s=50, cmap="spring")
plot_svc_decision_function(clf)
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=200, facecolors="none", edgecolors="k")
plt.show()

# Only the support vectors matter to an SVM: moving points without letting
# them cross the decision boundaries would have no effect.

# The SVM becomes more powerful in conjunction with kernels. Let's look
# at some data which is not linearly separable.
x, y = make_circles(100, factor=0.1, noise=0.1)
clf = SVC(kernel="linear").fit(x, y)
plt.scatter(x[:, 0], x[:, 1], c=y, s=50, cmap="spring")
plot_svc_decision_function(clf)
plt.show()

# Clearly no linear separation is going to work on this data.
# One way we can adjust to this data is to apply a kernel, which is some
# transformation of the input data. We could use the radial basis function.
r = np.exp(-(x[:, 0] ** 2 + x[:, 1] ** 2))

# If we plot this alongside our data, we can see the effect.
def plot_3D(elev=30, azim=30):
    ax = plt.subplot(projection="3d")
    ax.scatter3D(x[:, 0], x[:, 1], r, c=y, s=50, cmap="spring")