예제 #1
0
def get_toy_classification_data(n_samples=100, centers=3, n_features=2, type_data = "blobs"):
    """Generate a 2-D toy classification dataset and return a train/test split.

    Parameters
    ----------
    n_samples : int
        Total number of points to generate.
    centers : int
        Number of blob centers (used only when ``type_data == "blobs"``).
    n_features : int
        Dimensionality of the blobs (used only for ``"blobs"``).
    type_data : str
        One of ``"blobs"``, ``"moons"`` or ``"circles"``.

    Returns
    -------
    X_train, y_train, X_test, y_test, classes
        Train/test features, one-hot encoded train/test labels, and the
        array of unique class labels found in the training split.

    Raises
    ------
    ValueError
        If ``type_data`` is not a recognised dataset name.
    """
    if type_data == "blobs":
        X, y = make_blobs(n_samples=n_samples, centers=centers, n_features=n_features)
    elif type_data == "moons":
        X, y = make_moons(n_samples=n_samples, noise=0.1)
    elif type_data == "circles":
        X, y = make_circles(n_samples=n_samples, noise=0.05)
    else:
        # Previously an unknown name fell through and later crashed with a
        # NameError on X; fail fast with a clear message instead.
        raise ValueError("Unknown type_data: %r" % (type_data,))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=None)

    classes = np.unique(y_train)

    # one_hot_encode is a project helper defined elsewhere in this module.
    y_train = one_hot_encode(y_train, classes)
    y_test = one_hot_encode(y_test, classes)

    return X_train, y_train, X_test, y_test, classes
예제 #2
0
    def __init__(self):
        """Load the configured dataset and derive feature/target arrays.

        Reads the global ``Configuration`` object to decide which data file
        (or synthetic generator) to use, selects the feature/target columns
        appropriate for the configured algorithm, and records the output
        directory name for the chosen dataset.
        """
        self.start_time = time.time()

        # Only the CSV dataset is read from disk; "moons" and "circles"
        # are generated synthetically below.
        if Configuration.data == "Social_Network_Ads.csv":
            self.dataset = pd.read_csv(str(Configuration.data))

        if Configuration.algorithm == "linear_regression":
            self.X = self.dataset.iloc[:, :-1].values
            self.y = self.dataset.iloc[:, 1].values
        elif Configuration.algorithm in ("logistic_regression", "svc",
                                         "decision_tree_classification",
                                         "random_forest_classification",
                                         "knn"):
            if Configuration.data == "Social_Network_Ads.csv":
                self.X = self.dataset.iloc[:, [2, 3]].values
                self.y = self.dataset.iloc[:, 4].values
            else:
                if Configuration.data == "moons":
                    # sklearn.datasets.samples_generator was removed in
                    # scikit-learn 0.24; import directly from sklearn.datasets.
                    from sklearn.datasets import make_moons
                    self.X, self.y = make_moons(100, noise=.2, random_state=0)
                elif Configuration.data == "circles":
                    from sklearn.datasets import make_circles
                    self.X, self.y = make_circles(100, factor=.5, noise=.1, random_state=0)
        elif Configuration.algorithm == "polynomial_regression":
            self.X = self.dataset.iloc[:, 1:2].values
            self.y = self.dataset.iloc[:, 2].values
        elif Configuration.algorithm == "kmeans":
            self.X = self.dataset.iloc[:, [3, 4]].values
            # Clustering is unsupervised: no target vector.
            self.y = None

        # Directory name used when saving results for the selected dataset.
        if Configuration.data == "Social_Network_Ads.csv":
            self.directory = "SocialNetworkAds"
        elif Configuration.data == "moons":
            self.directory = "Moons"
        elif Configuration.data == "circles":
            self.directory = "Circles"
예제 #3
0
def qa2():
    """Cluster a two-moons dataset with KMeans and plot the results.

    Computes WCSS (inertia) for k = 1..10 (elbow method), then fits a
    4-cluster KMeans model and shows the elbow curve and the clustered
    points side by side in one figure.
    """
    print("khan")
    X, y = make_moons(n_samples=300, noise=0.05)
    print(y)

    # Within-cluster sum of squares for k = 1..10 (elbow method).
    wcss = []
    for i in range(1, 11):
        kmeans1 = KMeans(n_clusters=i, init='k-means++', max_iter=300,
                         n_init=10, random_state=0)
        kmeans1.fit(X)
        wcss.append(kmeans1.inertia_)

    # Fit the final model and predict cluster assignments.
    kMeanModel = KMeans(n_clusters=4)
    kMeanModel.fit(X)
    predictVal = kMeanModel.predict(X)
    print("****Predict Value*****")
    print(predictVal)

    fobj = plt.figure(figsize=(6, 6), facecolor=(1, 0, 1))
    # Figure.canvas.set_window_title was removed in Matplotlib 3.6;
    # the window title now lives on the canvas manager.
    fobj.canvas.manager.set_window_title('Plot Diagram')
    spobj1 = fobj.add_subplot(221)
    spobj1.scatter(range(1, 11), wcss)

    spobj2 = fobj.add_subplot(223)
    spobj2.scatter(X[:, 0], X[:, 1], c=predictVal, cmap='viridis')
    plt.show()
예제 #4
0
def plot_class(clf):
    """Fit *clf* on a two-moons sample and draw its decision regions."""
    X_train, y_train = make_moons(200, noise=0.20)

    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax.set_xlabel('feature 1', color='gray')
    ax.set_ylabel('feature 2', color='gray')
    ax.scatter(X_train[:, 0], X_train[:, 1],
               c=y_train, s=100, cmap='Paired', zorder=3)

    # Build a 50x50 grid spanning the current axis limits.
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    grid_x = np.linspace(xlim[0], xlim[1], 50)
    grid_y = np.linspace(ylim[0], ylim[1], 50)
    yy, xx = np.meshgrid(grid_y, grid_x)
    X_test = np.vstack([xx.ravel(), yy.ravel()]).T

    # Classify every grid point and shade the predicted regions.
    clf.fit(X_train, y_train)
    zz = clf.predict(X_test).reshape(xx.shape)
    ax.contourf(xx, yy, zz, cmap='Paired', alpha=0.4, zorder=1)
    plt.show()
예제 #5
0
 def generate_moons_sample(self):
     """Return a train/test split of a noisy two-moons sample.

     Labels are remapped from {0, 1} (as produced by make_moons)
     onto {-1, +1}.
     """
     features, labels = make_moons(n_samples=500, noise=0.08)
     # Shift {0, 1} to {-0.5, 0.5}, then scale to {-1, +1}.
     labels = (labels - 0.5) * 2
     X_train, X_test, y_train, y_test = train_test_split(
         features, labels, test_size=0.2)
     return X_train, X_test, y_train, y_test
예제 #6
0
def _get_moons(*args, **kwargs):
    """Return a noisy two-moons dataset plus classification metadata.

    Extra positional/keyword arguments are accepted and ignored so the
    function matches the common dataset-factory call signature.
    """
    features, targets = make_moons(n_samples=100, noise=0.1)
    # classifier_scoring is a module-level scoring definition.
    meta = {
        'regression': False,
        'scoring': classifier_scoring,
        'primary_metric': 'accuracy',
    }
    return features, targets, meta
예제 #7
0
    def get_dataset(self):
        """Build a synthetic dataset based on ``self.type``.

        Returns
        -------
        dict
            ``{'data': <feature array>, 'target': <label array>}``.

        Raises
        ------
        ValueError
            If ``self.type`` is neither ``'moon'`` nor ``'circle'``.
        """
        if self.type == 'moon':
            datas, labels = make_moons(n_samples=self.n_samples,
                                       noise=self.noise)
        elif self.type == 'circle':
            datas, labels = make_circles(n_samples=self.n_samples,
                                         noise=self.noise)
        else:
            # The original only printed a warning here and then crashed
            # with a NameError on `datas`; fail fast with a clear error.
            raise ValueError('wrong dataset type input.')

        dataset = {}
        dataset['data'] = datas
        dataset['target'] = labels
        return dataset
예제 #8
0
def choose_dataset(chosen_dataset, n_points):
    """Return the feature matrix of one of four synthetic 2-D datasets.

    chosen_dataset: one of "blobs", "moons", "scatter", "circle";
    any other value yields None. n_points is the sample count.
    Only the features are returned; labels are discarded.
    """
    X = None

    if chosen_dataset == "blobs":
        X, _ = make_blobs(n_samples=n_points,
                          centers=4,
                          n_features=2,
                          cluster_std=1.5,
                          random_state=42)
    elif chosen_dataset == "moons":
        X, _ = make_moons(n_samples=n_points, noise=0.05, random_state=42)
    elif chosen_dataset == "scatter":
        # Widely-spread blobs: three clusters with large, equal std-devs.
        X, _ = make_blobs(n_samples=n_points,
                          cluster_std=[2.5, 2.5, 2.5],
                          random_state=42)
    elif chosen_dataset == "circle":
        X, _ = make_circles(n_samples=n_points, noise=0, random_state=42)

    return X
예제 #9
0
def svm_kernal():
    """Compare a hand-rolled kernel Pegasos SVM with sklearn's RBF SVC
    on a two-moons dataset, then plot the SVC's decision regions."""
    # Synthetic binary-classification data, split and standardised.
    X, y = make_moons(n_samples=100, noise=0.1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Custom kernelised Pegasos trainer (project class).
    kernal_pegasos = Kernalpegasos(0.5)
    kernal_pegasos.fit(X_train, y_train, 100000, True, 'rbf')

    # Reference implementation: sklearn SVC with an RBF kernel.
    classifier = SVC(kernel='rbf', random_state=0, gamma=1, C=1)
    classifier.fit(X_train, y_train)

    # Predictions from both models (kept for inspection/debugging).
    y_pred = classifier.predict(X_test)
    y_kernal_pred = kernal_pegasos.check_test_points(X_train, y_train, X_test,
                                                     'rbf')
    cm = confusion_matrix(y_test, y_pred)

    # Visualise the SVC decision boundary over the full dataset.
    plot_decision_regions(X, y, classifier=classifier)
    plt.legend(loc='upper left')
    plt.tight_layout()
    plt.show()
예제 #10
0
"""Prepare an ML model using KMeans algorithm to cluster some sample input
generated using make_moon function. Plot the clusters. Also plot the same
points by clustering it with Spectral Clustering Model.
"""
from sklearn.datasets.samples_generator import make_moons
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
import sklearn.metrics
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

X, y_true = make_moons(n_samples=300, noise=0.05)
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)
y_means = kmeans.predict(X)
plt.scatter(X[:, 0], X[:, 1], s=50, c=y_means, cmap='viridis')
#plt.show()

model = SpectralClustering(2, affinity='nearest_neighbors')
labels = model.fit_predict(X)
plt.scatter(X[:, 0], X[:, 1], s=50, c=labels, cmap='viridis')
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
print(tf.__version__)

# sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
# the generators live directly in sklearn.datasets.
from sklearn.datasets import make_moons
from sklearn.datasets import make_circles
from sklearn.datasets import make_blobs

# generate a 2-D classification dataset
n = 10000
X, y = make_moons(n_samples=n, noise=0.1)

# scatter plot, dots colored by class value
df = pd.DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))
colors = {0: 'red', 1: 'blue'}
fig, ax = plt.subplots()
grouped = df.groupby('label')
for key, group in grouped:
    group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key])
plt.show()

# Rebuild the data as a DataFrame, then extract numpy arrays again for
# the downstream train/test split.
datadict = {'X1': X[:, 0], 'X2': X[:, 1], 'target': y}
data = pd.DataFrame(data=datadict)

X = data.iloc[:, [0, 1]].values
type(X)

y = data.target.values

# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split
예제 #12
0
파일: dbscan.py 프로젝트: ouriris/ml-course
"""
print(__doc__)

import numpy as np

from sklearn.cluster import DBSCAN

from sklearn import metrics
# sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
# make_moons is importable directly from sklearn.datasets.
from sklearn.datasets import make_moons
from sklearn.preprocessing import StandardScaler


# #############################################################################
# Generate sample data
# NOTE(review): `centers` is unused below — make_moons takes no centers
# argument; it looks like a leftover from a make_blobs example.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_moons(n_samples=1000, noise=0.1)


X = StandardScaler().fit_transform(X)

# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labels = db.labels_

# Number of clusters in labels, ignoring noise if present.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

# #############################################################################
예제 #13
0
from sklearn.cluster import KMeans
# sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
# make_moons is importable directly from sklearn.datasets.
from sklearn.datasets import make_moons
from sklearn.cluster import SpectralClustering
import matplotlib.pyplot as plt

X, y_true = make_moons(200, noise=0.05)

# Simple KMeans: distance-to-centroid partitioning cuts across the moons.
kmeans = KMeans(2)
kmeans.fit(X)
y_means = kmeans.predict(X)

plt.scatter(X[:, 0], X[:, 1], s=50, c=y_means)

plt.show()

# Spectral clustering with a nearest-neighbour affinity separates the
# two crescents correctly.
model = SpectralClustering(2, affinity='nearest_neighbors')
labels = model.fit_predict(X)

# Plot graph
plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
plt.show()
예제 #14
0
# Seaborn >= 0.12 requires lmplot's column names as keyword arguments.
sns.lmplot(x='x', y='y', data=df, fit_reg=False)
plt.show()

# ----- make_blobs -----
# sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
# the generators are importable directly from sklearn.datasets.
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=200, centers=4, n_features=2, random_state=101)
df = pd.DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))

sns.lmplot(x='x', y='y', data=df, hue='label', fit_reg=False)
plt.show()

# ----- make_circles -----
from sklearn.datasets import make_circles

X, y = make_circles(n_samples=200, noise=0.1, factor=0.5, random_state=101)
df = pd.DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))

sns.lmplot(x='x', y='y', data=df, hue='label', fit_reg=False)
plt.show()

# ----- make_moons -----
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=200, noise=0.1, random_state=101)
df = pd.DataFrame(dict(x=X[:, 0], y=X[:, 1], label=y))

sns.lmplot(x='x', y='y', data=df, hue='label', fit_reg=False)
plt.show()
예제 #15
0
                               squared=True)
# Match each KMeans label to its nearest reference centre so colours
# stay consistent between subplots.
order = distance.argmin(axis=0)
plt.subplot(122)
for k, col in zip(range(3), colors):
    # Points assigned to the k-th (reordered) cluster.
    my_members = k_means_3_labels == order[k]
    plt.scatter(X[my_members, 0], X[my_members, 1],c=col, marker='o', s=20)
    # Mark the matching cluster centre with a larger dot.
    cluster_center = k_means_3_cluster_centres[order[k]]
    plt.scatter(cluster_center[0], cluster_center[1], marker = 'o', c=col, s=200, alpha=0.8)
plt.axis('equal')
plt.title('KMeans 3')

'''
#2: NON-SPHERICAL SHAPES
'''

# Two-moons data: non-spherical clusters that KMeans handles poorly.
[X, true_labels] = make_moons(n_samples=1000, noise=.05)

plt.figure(figsize=(12, 6))
plt.suptitle('Non-Spherical Shapes', fontsize=15)
plt.subplot(121)
# Left subplot: the ground-truth labelling.
for k, col in zip(range(2), colors):
    my_members = true_labels == k
    plt.scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)

plt.axis('equal')
plt.title('Original Data')

# Compute clustering with 2 Clusters
k_means_2 = KMeans(init='k-means++', n_clusters=2, n_init=10)
k_means_2.fit(X)
k_means_2_labels = k_means_2.labels_
예제 #16
0
# you can change them when you are testing your solution but when submitting leave it in the original state
n_samples = 50
C_const = 100
threshold = 1e-3

# Almost linearly separable blobs; the SVM formulation expects labels in
# {-1, +1}, so 0 labels are remapped to -1.
X_blob, Y_blob = make_blobs(n_samples=n_samples, centers=2, random_state=0,
                            cluster_std=1.00)
Y_blob[Y_blob == 0] = -1

# Train and plot an SVM on the blob data with each kernel in turn.
minimize_and_plot(X_blob, Y_blob, linear_kernel, C_const,
                  threshold)  # svm with linear kernel
minimize_and_plot(X_blob, Y_blob, polynomial_kernel, C_const,
                  threshold)  # svm with polynomial kernel

# Moon-shaped data with the same {-1, +1} label convention.
X_moon, Y_moon = make_moons(n_samples=n_samples, shuffle=False, noise=0.10,
                            random_state=0)
Y_moon[Y_moon == 0] = -1

minimize_and_plot(X_moon, Y_moon, linear_kernel, C_const,
                  threshold)  # svm with linear kernel
minimize_and_plot(X_moon, Y_moon, polynomial_kernel, C_const,
                  threshold)  # svm with polynomial kernel
예제 #17
0
import networkx as nx

# import warnings
# warnings.filterwarnings("ignore")

##################################################################################################
# # Generating Dataset

# Three synthetic 2-D datasets: well-separated blobs, two moons, and
# high-variance ("varied") blobs. Fixed seeds keep runs reproducible.
X, y = make_blobs(n_samples=120,
                  centers=4,
                  n_features=2,
                  cluster_std=1.8,
                  random_state=42)

X1, y1 = make_moons(n_samples=80, noise=0.05, random_state=42)

# Only the features of the varied blobs are kept (labels discarded).
varied = make_blobs(n_samples=120,
                    cluster_std=[3.5, 3.5, 3.5],
                    random_state=42)[0]
plt.scatter(varied[:, 0], varied[:, 1])
# plt.gcf().gca().add_artist(plt.Circle((-5, 0), 5, color="red", fill=False, linewidth=3, alpha=0.7))
plt.show()

##################################################################################################
# OPTICS

# Project-local OPTICS implementation (not sklearn's).
from algorithms.optics import OPTICS, plot_clust

# Cluster the blob dataset and plot the reachability diagram.
ClustDist, CoreDist = OPTICS(X, eps=0.5, minPTS=3, plot=True, plot_reach=True)
예제 #18
0
import numpy as np
import matplotlib.pyplot as plt
# sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
# make_moons now lives directly in sklearn.datasets.
from sklearn.datasets import make_moons

np.random.seed(42)
data, labels = make_moons(n_samples=500, noise=0.1)
# Red for class 1, blue for class 0.
colors = ['r' if y else 'b' for y in labels]
print('data.shape =', data.shape, ',  labels.shape =', labels.shape)
plt.scatter(data[:, 0], data[:, 1], c=colors)
# plt.show()


def sigmoid(z):
    """Logistic function: map a real scalar or array into (0, 1)."""
    return 1.0 / (1.0 + np.exp(-z))


def logistic_regression(x, y, learning_rate, num_steps=40):
    ''' Input:  x = the data
                y = the labels
                learning_rate = learning rate
                num_steps = number of iterations
        Output: w = the trained model weights    '''

    # Start by intializing the weights w with w_i = 1 for all i, and make it a 3x1
    # column vector (numpy array).  You can use the numpy function, ones.
    # YOUR CODE HERE
    w = np.ones((np.size(x, 1) + 1, 1))  # MODIFY THIS LINE!

    # Augment x with an initial column of ones for the bias term (the zeroth column of x).
    # You can use the numpy functions ones and hstack to accomplish this.
    # YOUR CODE HERE
예제 #19
0
def generate_two_moons():
    """Return a 200-sample noisy two-moons dataset and its true labels."""
    X, true_labels = make_moons(n_samples=200, noise=.05)
    return X, true_labels
예제 #20
0
# makes sine wave
#x_t = np.linspace(-1.5*np.pi, 1.5*np.pi, n_point_per_cluster)
x_t = np.linspace(-20, -10, n_point_per_cluster)
y_t = 10 + 5*np.sin(x_t)
# NOTE(review): the noise amplitude here is 0.0, so these randn() calls
# only advance the global RNG state without changing y_t.
for i in range(len(y_t)):
    y_t[i] = y_t[i]+0.0*np.random.randn()
# Collect the sine-wave points as a list of 2-element arrays [x, y].
x5 = []
for i in range(len(x_t)):
    xx = []
    xx.append(x_t[i])
    xx.append(y_t[i])
    x5.append(np.array(xx))

#makes a moon
# Two moons scaled by 5 and shifted so they sit roughly around (15, -15).
x4, y4 = make_moons(n_samples= 3*n_point_per_cluster,shuffle=True, noise=0.01, random_state= 0)
x4 = [15, -15] + 5.0*x4



# Horizontal noisy line: y = m*t + c with Gaussian noise of amplitude
# line_noise (m = 0, so the line is flat at y = c = -13).
line_t = np.linspace(-5, 5, n_point_per_cluster)
line_y1 = []
m = 0.0
c = -13.0
line_noise = 0.5
for i in range(len(line_t)):
    line_y1.append(m*line_t[i] + line_noise*np.random.randn() + c)
line1 = []
for i in range(len(line_y1)):
    xx = []
예제 #21
0
def generate_two_moons():
    """Generate 200 noisy two-moons samples; returns (points, labels)."""
    points, labels = make_moons(n_samples=200, noise=.05)
    return points, labels
예제 #22
0
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for index, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.8,
                    c=colors[index],
                    marker=markers[index],
                    label=cl,
                    edgecolor='black')


# NOTE(review): (X, y) is assigned three times; only the last call
# (make_moons) takes effect — the blob and gaussian-quantile datasets
# are generated and immediately discarded. The first two lines look
# like leftover experiments kept to toggle between datasets.
(X, y) = make_blobs(n_samples=1000, n_features=2, centers=2, cluster_std=1.05)
(X, y) = make_gaussian_quantiles(n_samples=1000, n_features=2, n_classes=3)
(X, y) = make_moons(n_samples=1000)

# Stratified train/test split, then standardise features using
# statistics computed on the training set only.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    stratify=y)

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

# RBF-kernel SVM; plot_decision_regions is defined earlier in this file.
svm = SVC(kernel='rbf', C=1.0, gamma=0.5)
svm.fit(X_train_std, y_train)
plot_decision_regions(X_test_std, y_test, svm)
plt.show()
예제 #23
0
        算法流程:
            a. 如果一个样本点的 - 邻域包含多于m个对象,则创建一个p作为核心对象的新簇。
            b. 寻找核心对象的直接密度可达的对象,被合并为一个新的簇。
            c. 直到没有点可以更新簇时算法结束。
            注意:非核心对象是没有直接密度可达的对象的,它们一般构成了簇的边缘。每个簇可包含多个核心对象。
"""

# sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
# the generators are importable directly from sklearn.datasets.
from sklearn.datasets import make_moons
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_circles
import matplotlib.pyplot as plt
import time

# Show the moon-shaped dataset coloured by its true labels.
x, y_true = make_moons(n_samples=1000, noise=0.15)
plt.scatter(x[:, 0], x[:, 1], c=y_true)
plt.show()

# KMeans: time the fit and colour points by the learned labels.
start = time.time()
kmeans = KMeans(init='k-means++', n_clusters=2, random_state=8).fit(x)
end = time.time()
interval = end - start
plt.scatter(x[:, 0], x[:, 1], c=kmeans.labels_)
plt.title('time: %f' % interval)
plt.show()

# DBSCAN: density-based clustering copes with the non-convex moons.
start = time.time()
dbscan = DBSCAN(eps=.1, min_samples=6).fit(x)
예제 #24
0
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib.cm as cm
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
# sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
# the generators live directly in sklearn.datasets.
from sklearn.datasets import make_blobs, make_moons, make_circles

x, y = make_moons(n_samples=200, noise=.05, random_state=0)
n_clusters = 2

# Clustering with KMeans (random init, fixed seed for reproducibility).
clusterer = KMeans(n_clusters=n_clusters, init='random', random_state=10)
cluster_labels = clusterer.fit_predict(x, y)

plt.figure()
# Map cluster indices onto a continuous colormap.
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
plt.scatter(x[:, 0],
            x[:, 1],
            marker='.',
            s=70,
            lw=0,
            alpha=0.7,
            c=colors,
            edgecolor='k')
plt.title('KMeans')
plt.show()

# Hierarchical clustering: linkage strategies to compare.
linkage_list = ['single', 'average', 'complete', 'ward']
예제 #25
0
def main(selectedDataset="digits", pop_size=100, max_generations=100):

    # a few hard-coded values
    figsize = [5, 4]
    seed = 42
    #	pop_size = 300
    offspring_size = 2 * pop_size
    #	max_generations = 300
    maximize = False
    #	selectedDataset = "circles"
    selectedClassifiers = ["SVC"]

    # a list of classifiers
    allClassifiers = [
        [RandomForestClassifier, "RandomForestClassifier", 1],
        [BaggingClassifier, "BaggingClassifier", 1],
        [SVC, "SVC", 1],
        [RidgeClassifier, "RidgeClassifier", 1],
        #			[AdaBoostClassifier, "AdaBoostClassifier", 1],
        #			[ExtraTreesClassifier, "ExtraTreesClassifier", 1],
        #			[GradientBoostingClassifier, "GradientBoostingClassifier", 1],
        #			[SGDClassifier, "SGDClassifier", 1],
        #			[PassiveAggressiveClassifier, "PassiveAggressiveClassifier", 1],
        #			[LogisticRegression, "LogisticRegression", 1],
    ]

    selectedClassifiers = [classifier[1] for classifier in allClassifiers]

    folder_name = datetime.datetime.now().strftime(
        "%Y-%m-%d-%H-%M") + "-archetypes-" + selectedDataset + "-" + str(
            pop_size)
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    else:
        sys.stderr.write("Error: folder \"" + folder_name +
                         "\" already exists. Aborting...\n")
        sys.exit(0)
    # open the logging file
    logfilename = os.path.join(folder_name, 'logfile.log')
    logger = setup_logger('logfile_' + folder_name, logfilename)
    logger.info("All results will be saved in folder \"%s\"" % folder_name)

    # load different datasets, prepare them for use
    logger.info("Preparing data...")
    # synthetic databases
    centers = [[1, 1], [-1, -1], [1, -1]]
    blobs_X, blobs_y = make_blobs(n_samples=400,
                                  centers=centers,
                                  n_features=2,
                                  cluster_std=0.6,
                                  random_state=seed)
    circles_X, circles_y = make_circles(n_samples=400,
                                        noise=0.15,
                                        factor=0.4,
                                        random_state=seed)
    moons_X, moons_y = make_moons(n_samples=400, noise=0.2, random_state=seed)
    iris = datasets.load_iris()
    digits = datasets.load_digits()
    #	forest_X, forest_y = loadForestCoverageType() # local function
    mnist_X, mnist_y = loadMNIST()  # local function

    dataList = [
        [blobs_X, blobs_y, 0, "blobs"],
        [circles_X, circles_y, 0, "circles"],
        [moons_X, moons_y, 0, "moons"],
        [iris.data, iris.target, 0, "iris4"],
        [iris.data[:, 2:4], iris.target, 0, "iris2"],
        [digits.data, digits.target, 0, "digits"],
        #			[forest_X, forest_y, 0, "covtype"],
        [mnist_X, mnist_y, 0, "mnist"]
    ]

    # argparse; all arguments are optional
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--classifiers",
        "-c",
        nargs='+',
        help="Classifier(s) to be tested. Default: %s. Accepted values: %s" %
        (selectedClassifiers[0], [x[1] for x in allClassifiers]))
    parser.add_argument(
        "--dataset",
        "-d",
        help="Dataset to be tested. Default: %s. Accepted values: %s" %
        (selectedDataset, [x[3] for x in dataList]))

    parser.add_argument("--pop_size",
                        "-p",
                        type=int,
                        help="EA population size. Default: %d" % pop_size)
    parser.add_argument("--offspring_size",
                        "-o",
                        type=int,
                        help="Ea offspring size. Default: %d" % offspring_size)
    parser.add_argument("--max_generations",
                        "-mg",
                        type=int,
                        help="Maximum number of generations. Default: %d" %
                        max_generations)

    # finally, parse the arguments
    args = parser.parse_args()

    # a few checks on the (optional) inputs
    if args.dataset:
        selectedDataset = args.dataset
        if selectedDataset not in [x[3] for x in dataList]:
            logger.info(
                "Error: dataset \"%s\" is not an accepted value. Accepted values: %s"
                % (selectedDataset, [x[3] for x in dataList]))
            sys.exit(0)

    if args.classifiers != None and len(args.classifiers) > 0:
        selectedClassifiers = args.classifiers
        for c in selectedClassifiers:
            if c not in [x[1] for x in allClassifiers]:
                logger.info(
                    "Error: classifier \"%s\" is not an accepted value. Accepted values: %s"
                    % (c, [x[1] for x in allClassifiers]))
                sys.exit(0)

    if args.max_generations: max_generations = args.max_generations
    if args.pop_size: pop_size = args.pop_size
    if args.offspring_size: offspring_size = args.offspring_size

    # TODO: check that min_points < max_points and max_generations > 0

    # print out the current settings
    logger.info("Settings of the experiment...")
    logger.info("Fixed random seed: %d" % (seed))
    logger.info("Selected dataset: %s; Selected classifier(s): %s" %
                (selectedDataset, selectedClassifiers))
    logger.info(
        "Population size in EA: %d; Offspring size: %d; Max generations: %d" %
        (pop_size, offspring_size, max_generations))

    # create the list of classifiers
    classifierList = [x for x in allClassifiers if x[1] in selectedClassifiers]

    # pick the dataset
    db_index = -1
    for i in range(0, len(dataList)):
        if dataList[i][3] == selectedDataset:
            db_index = i

    dbname = dataList[db_index][3]

    X, y = dataList[db_index][0], dataList[db_index][1]
    number_classes = np.unique(y).shape[0]

    logger.info("Creating train/test split...")
    from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
    listOfSplits = [split for split in skf.split(X, y)]
    trainval_index, test_index = listOfSplits[0]
    X_trainval, y_trainval = X[trainval_index], y[trainval_index]
    X_test, y_test = X[test_index], y[test_index]
    skf = StratifiedKFold(n_splits=3, shuffle=False, random_state=seed)
    listOfSplits = [split for split in skf.split(X_trainval, y_trainval)]
    train_index, val_index = listOfSplits[0]
    X_train, y_train = X_trainval[train_index], y_trainval[train_index]
    X_val, y_val = X_trainval[val_index], y_trainval[val_index]
    logger.info(
        "Training set: %d lines (%.2f%%); test set: %d lines (%.2f%%)" %
        (X_train.shape[0],
         (100.0 * float(X_train.shape[0] / X.shape[0])), X_test.shape[0],
         (100.0 * float(X_test.shape[0] / X.shape[0]))))

    # rescale data
    scaler = StandardScaler()
    sc = scaler.fit(X_train)
    X = sc.transform(X)
    X_trainval = sc.transform(X_trainval)
    X_train = sc.transform(X_train)
    X_val = sc.transform(X_val)
    X_test = sc.transform(X_test)

    for classifier in classifierList:

        classifier_name = classifier[1]

        # start creating folder name
        experiment_name = os.path.join(
            folder_name,
            datetime.datetime.now().strftime("%Y-%m-%d-%H-%M") +
            "-archetypes-evolution-" + dbname + "-" + classifier_name)
        if not os.path.exists(experiment_name): os.makedirs(experiment_name)

        logger.info("Classifier used: " + classifier_name)

        start = time.time()
        solutions, trainAccuracy, testAccuracy = evolveArchetypes(
            X,
            y,
            X_train,
            y_train,
            X_test,
            y_test,
            classifier,
            pop_size,
            offspring_size,
            max_generations,
            number_classes=number_classes,
            maximize=maximize,
            seed=seed,
            experiment_name=experiment_name)
        end = time.time()
        exec_time = end - start

        # only candidates with all classes are considered
        final_archive = []
        for sol in solutions:
            c = sol.candidate
            c = np.array(c)
            y_core = c[:, -1]
            if len(set(y_core)) == number_classes:
                final_archive.append(sol)

        logger.info("Now saving final Pareto front in a figure...")
        pareto_front_x = [f.fitness[0] for f in final_archive]
        pareto_front_y = [f.fitness[1] for f in final_archive]

        figure = plt.figure(figsize=figsize)
        ax = figure.add_subplot(111)
        ax.plot(pareto_front_x,
                pareto_front_y,
                "bo-",
                label="Solutions in final archive")
        ax.set_title("Optimal solutions")
        ax.set_xlabel("Archetype set size")
        ax.set_ylabel("Error")
        ax.set_xlim([1, X_train.shape[0]])
        ax.set_ylim([0, 0.4])
        plt.tight_layout()
        plt.savefig(
            os.path.join(
                experiment_name,
                "%s_EvoArch_%s_pareto.png" % (dbname, classifier_name)))
        plt.savefig(
            os.path.join(
                experiment_name,
                "%s_EvoArch_%s_pareto.pdf" % (dbname, classifier_name)))
        plt.close(figure)

        figure = plt.figure(figsize=figsize)
        ax = figure.add_subplot(111)
        ax.plot(pareto_front_x,
                pareto_front_y,
                "bo-",
                label="Solutions in final archive")
        ax.set_title("Optimal solutions")
        ax.set_xlabel("Archetype set size")
        ax.set_ylabel("Error")
        plt.tight_layout()
        plt.savefig(
            os.path.join(
                experiment_name,
                "%s_EvoArch_%s_pareto_zoom.png" % (dbname, classifier_name)))
        plt.savefig(
            os.path.join(
                experiment_name,
                "%s_EvoArch_%s_pareto_zoom.pdf" % (dbname, classifier_name)))
        plt.close(figure)

        # initial performance
        X_err, testAccuracy, model, fail_points, y_pred = evaluate_core(
            X_trainval,
            y_trainval,
            X_test,
            y_test,
            classifier[0],
            cname=classifier_name,
            SEED=seed)
        X_err, trainAccuracy, model, fail_points, y_pred = evaluate_core(
            X_trainval,
            y_trainval,
            X_trainval,
            y_trainval,
            classifier[0],
            cname=classifier_name,
            SEED=seed)
        logger.info("Compute performances!")
        logger.info("Elapsed time (seconds): %.4f" % (exec_time))
        logger.info("Initial performance: train=%.4f, test=%.4f, size: %d" %
                    (trainAccuracy, testAccuracy, X_train.shape[0]))

        # best solution
        accuracy = []
        for sol in final_archive:
            c = sol.candidate
            c = np.array(c)
            X_core = c[:, :-1]
            y_core = c[:, -1]
            X_err, accuracy_val, model, fail_points, y_pred = evaluate_core(
                X_core,
                y_core,
                X_val,
                y_val,
                classifier[0],
                cname=classifier_name,
                SEED=seed)
            X_err, accuracy_train, model, fail_points, y_pred = evaluate_core(
                X_core,
                y_core,
                X_train,
                y_train,
                classifier[0],
                cname=classifier_name,
                SEED=seed)
            accuracy.append(np.mean([accuracy_val, accuracy_train]))

        best_ids = np.array(np.argsort(accuracy)).astype('int')[::-1]
        count = 0
        for i in best_ids:

            if count > 2:
                break

            c = final_archive[i].candidate
            c = np.array(c)

            X_core = c[:, :-1]
            y_core = c[:, -1]

            X_err, accuracy_train, model, fail_points, y_pred = evaluate_core(
                X_core,
                y_core,
                X_train,
                y_train,
                classifier[0],
                cname=classifier_name,
                SEED=seed)
            X_err, accuracy_val, model, fail_points, y_pred = evaluate_core(
                X_core,
                y_core,
                X_val,
                y_val,
                classifier[0],
                cname=classifier_name,
                SEED=seed)
            X_err, accuracy, model, fail_points, y_pred = evaluate_core(
                X_core,
                y_core,
                X_test,
                y_test,
                classifier[0],
                cname=classifier_name,
                SEED=seed)
            logger.info(
                "Minimal train/val error: train: %.4f, val: %.4f; test: %.4f, size: %d"
                % (accuracy_train, accuracy_val, accuracy, X_core.shape[0]))

            if False:  #(dbname == "mnist" or dbname == "digits") and count == 0:

                if dbname == "mnist":
                    H, W = 28, 28
                if dbname == "digits":
                    H, W = 8, 8

                logger.info("Now saving figures...")

                # save archetypes
                for index in range(0, len(y_core)):
                    image = np.reshape(X_core[index, :], (H, W))
                    plt.figure()
                    plt.axis('off')
                    plt.imshow(image, cmap=plt.cm.gray_r)
                    plt.title('Label: %d' % (y_core[index]))
                    plt.tight_layout()
                    plt.savefig(
                        os.path.join(
                            experiment_name,
                            "digit_%d_idx_%d.pdf" % (y_core[index], index)))
                    plt.savefig(
                        os.path.join(
                            experiment_name,
                            "digit_%d_idx_%d.png" % (y_core[index], index)))
                    plt.close()

                # save test errors
                e = 1
                for index in range(0, len(y_test)):
                    if fail_points[index] == True:
                        image = np.reshape(X_test[index, :], (H, W))
                        plt.figure()
                        plt.axis('off')
                        plt.imshow(image, cmap=plt.cm.gray_r)
                        plt.title('Label: %d - Prediction: %d' %
                                  (y_test[index], y_pred[index]))
                        plt.savefig(
                            os.path.join(
                                experiment_name,
                                "err_lab_%d_pred_%d_idx_%d.pdf" %
                                (y_test[index], y_pred[index], e)))
                        plt.savefig(
                            os.path.join(
                                experiment_name,
                                "err_lab_%d_pred_%d_idx_%d.png" %
                                (y_test[index], y_pred[index], e)))
                        plt.close()
                        e = e + 1

            # plot decision boundaries if we have only 2 dimensions!
            if X.shape[1] == 2:

                cmap = ListedColormap(sns.color_palette("bright", 3).as_hex())
                xx, yy = make_meshgrid(X[:, 0], X[:, 1])
                figure = plt.figure(figsize=figsize)
                _, Z_0 = plot_contours(model, xx, yy, colors='k', alpha=0.2)
                #			plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cmap, marker='s', alpha=0.4, label="train")
                plt.scatter(X_test[:, 0],
                            X_test[:, 1],
                            c=y_test,
                            cmap=cmap,
                            marker='+',
                            alpha=0.3,
                            label="test")
                plt.scatter(X_core[:, 0],
                            X_core[:, 1],
                            c=y_core,
                            cmap=cmap,
                            marker='D',
                            facecolors='none',
                            edgecolors='none',
                            alpha=1,
                            label="archetypes")
                plt.scatter(X_err[:, 0],
                            X_err[:, 1],
                            marker='x',
                            facecolors='k',
                            edgecolors='k',
                            alpha=1,
                            label="errors")
                plt.legend()
                plt.title("%s - acc. %.4f" % (classifier_name, accuracy))
                plt.tight_layout()
                plt.savefig(
                    os.path.join(
                        experiment_name, "%s_EvoArch_%s_%d.png" %
                        (dbname, classifier_name, count)))
                plt.savefig(
                    os.path.join(
                        experiment_name, "%s_EvoArch_%s_%d.pdf" %
                        (dbname, classifier_name, count)))
                plt.close(figure)

                if count == 0:
                    # using all samples in the training set
                    X_err, accuracy, model, fail_points, y_pred = evaluate_core(
                        X_trainval,
                        y_trainval,
                        X_test,
                        y_test,
                        classifier[0],
                        cname=classifier_name,
                        SEED=seed)
                    X_err_train, trainAccuracy, model_train, fail_points_train, y_pred_train = evaluate_core(
                        X_trainval,
                        y_trainval,
                        X_trainval,
                        y_trainval,
                        classifier[0],
                        cname=classifier_name,
                        SEED=seed)

                    figure = plt.figure(figsize=figsize)
                    _, Z_0 = plot_contours(model,
                                           xx,
                                           yy,
                                           colors='k',
                                           alpha=0.2)
                    plt.scatter(X_trainval[:, 0],
                                X_trainval[:, 1],
                                c=y_trainval,
                                cmap=cmap,
                                marker='s',
                                alpha=0.4,
                                label="train")
                    plt.scatter(X_test[:, 0],
                                X_test[:, 1],
                                c=y_test,
                                cmap=cmap,
                                marker='+',
                                alpha=0.4,
                                label="test")
                    plt.scatter(X_err[:, 0],
                                X_err[:, 1],
                                marker='x',
                                facecolors='k',
                                edgecolors='k',
                                alpha=1,
                                label="errors")
                    plt.legend()
                    plt.title("%s - acc. %.4f" % (classifier_name, accuracy))
                    plt.tight_layout()
                    plt.savefig(
                        os.path.join(
                            experiment_name, "%s_EvoArch_%s_alltrain.png" %
                            (dbname, classifier_name)))
                    plt.savefig(
                        os.path.join(
                            experiment_name, "%s_EvoArch_%s_alltrain.pdf" %
                            (dbname, classifier_name)))
                    plt.close(figure)

            count = count + 1

    logger.handlers.pop()

    return
예제 #26
0
        }

        ## Automatic cluster coloring
        cNorm = colors.Normalize(vmin=0, vmax=1)
        scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=color_map)

        for method_name, m in methods.items():
            print_title('+', '%s' % method_name)
            x, y = sg.make_circles(n_samples=n_samples, factor=.5, noise=.05)
            pred_labels = m.fit_predict(x)
            colors = [scalarMap.to_rgba(l) for l in pred_labels]
            plt.scatter(x[:, 0], x[:, 1], c=colors)
            plt.savefig('./plots/circle_%s.pdf' % method_name)
            clear_plt()

            x, y = sg.make_moons(n_samples=n_samples, noise=.05)
            pred_labels = m.fit_predict(x)
            colors = [scalarMap.to_rgba(l) for l in pred_labels]
            plt.scatter(x[:, 0], x[:, 1], c=colors)
            plt.savefig('./plots/moons_%s.pdf' % method_name)
            clear_plt()

    # THIRD EXPERIMENT
    if make_experiment[2]:
        print_title('=', 'THIRD EXPERIMENT')
        data = pd.read_csv('./data/processed.csv')
        x = data.drop(['num'], axis=1)
        experiments = {
            'binary': (2, data.num.apply(lambda x: int(x != 0))),
            'normal': (5, data.num)
        }
예제 #27
0
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import pandas as pd
# FIX: `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# the public `sklearn.datasets` path works on all versions.
from sklearn.datasets import make_moons
import seaborn as sns

# Two interleaved half-moons; random_state pins the noise for reproducibility.
x, y = make_moons(1000, noise=.05, random_state=0)
X_moon = pd.DataFrame(x, columns=['f1', 'f2'])
# Clustering method: elbow curve — fit k-means for k = 2..10 and record each
# fit's inertia (within-cluster sum of squared distances).
cost_list = []
k_val = range(2, 11)
for i in k_val:
    kmean_1 = KMeans(n_clusters=i, init='random', n_init=10, max_iter=300, tol=1e-4, random_state=None)
    kmean_1.fit_predict(X_moon)
    cost_list.append(kmean_1.inertia_)

plt.figure(figsize=(6,5))
plt.plot(k_val, cost_list, '-*m')
plt.grid()
plt.xlabel('Number of centroids (clusters)')  # FIX: label typo "centoids"
plt.ylabel('Cost function')

# Final model with the "true" number of clusters for the moons data.
km = KMeans(n_clusters=2, init='random', n_init=10,
            max_iter=350, tol=1e-4, random_state=None)
# y_km - prediction result: one cluster label per sample
km.fit(X_moon)
y_km = km.predict(X_moon)

# Plot: store the assignment so the seaborn plot below can color by cluster.
X_moon['k_means'] = y_km
sns.lmplot(data=X_moon, x='f1', y='f2', fit_reg=False, legend=False, hue='k_means', palette=['#eb6c6a', '#6aeb6c']).set(
예제 #28
0
# Interactive mode so each figure is drawn without blocking on plt.show().
plt.ion()

# Fixed viewport for the first (widest-spread) dataset.
plt.axis([-15, 5, -12, 12])

# Three Gaussian-blob datasets of increasing difficulty, each described by
# its make_blobs keyword arguments.
blob_configs = [
    # linearly separable with two classes
    dict(n_samples=100, centers=2, n_features=2,
         cluster_std=2.5, random_state=1),
    # linearly separable with three classes
    dict(n_samples=150, centers=[[1, 1], [-1, -1], [1, -2]], n_features=2,
         cluster_std=0.3, random_state=123),
    # non-linearly separable (heavily overlapping blobs)
    dict(n_samples=100, centers=2, n_features=2,
         cluster_std=3.8, random_state=1),
]
for cfg in blob_configs:
    points, clusters = make_blobs(**cfg)
    plot(points, clusters)  # `plot` helper is defined elsewhere in this file

# very non-linearly separable: interleaved half-moons
points, clusters = make_moons(n_samples=100, noise=0.1, random_state=1)
plot(points, clusters)
  plt.xlim(X1.min(), X1.max())
  plt.ylim(X2.min(), X2.max())
  for i, j in enumerate(np.unique(y_set)):
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                c = ListedColormap(('red', 'blue'))(i), label = j)
  plt.title(text)
  plt.xlabel('X')
  plt.ylabel('y')
  plt.legend()
  plt.show()

"""## Make weird datasets to throw our models off"""

from sklearn.datasets.samples_generator import make_moons
# generate 2d classification dataset
X, y = make_moons(n_samples=1000, noise=0.3)
df = pd.DataFrame(dict(x=X[:,0], y=X[:,1], label=y))
colors = {0:'red', 1:'blue'}
fig, ax = plt.subplots()
grouped = df.groupby('label')
for key, group in grouped:
    group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key])
plt.show()

datadict = {'X1': X[:,0],'X2' : X[:,1], 'target': y}
data = pd.DataFrame(data=datadict)
X = data.iloc[:, [0,1]].values
y = data.iloc[:, 2].values

# TRAIN TEST SPLIT
from sklearn.model_selection import train_test_split
예제 #30
0
# Add noise to the data. NOTE(review): X and labels appear to come from a
# make_classification call earlier in the file (per the plot title) — confirm.
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
df = pd.DataFrame(np.c_[X,labels],columns = ['feature1','feature2','labels'])
df.plot.scatter('feature1','feature2', s = 100, c = list(df['labels']),cmap = 'rainbow',colorbar = False, alpha = 0.8,title = 'dataset by make_classification')
plt.show()


# FIX: `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# the public `sklearn.datasets` path works on all versions (applies to the
# three generator imports below).
from sklearn.datasets import make_circles
# Two concentric noisy circles; factor is the inner/outer radius ratio.
X,labels=make_circles(n_samples=200,noise=0.2,factor=0.2)
df = pd.DataFrame(np.c_[X,labels],columns = ['feature1','feature2','labels'])
df.plot.scatter('feature1','feature2', s = 100, c = list(df['labels']),
cmap = 'rainbow',colorbar = False, alpha = 0.8,title = 'dataset by make_circles')
plt.show()


from matplotlib import pyplot as plt
from sklearn.datasets import make_moons
# Two interleaved half-moons, colored by class.
x1,y1=make_moons(n_samples=1000,noise=0.1)
plt.title('make_moons function example')
plt.scatter(x1[:,0],x1[:,1],marker='o',c=y1)
plt.show()


from sklearn.datasets import make_regression
# Single-feature regression problem; coef=True also returns the true
# underlying coefficient of the generating linear model.
X,Y,coef = make_regression(n_samples=100, n_features=1, bias=5, tail_strength= 0, noise= 1, shuffle=True, coef=True, random_state=None)
print(coef)  # e.g. 49.0895... — varies between runs because random_state is None
df = pd.DataFrame(np.c_[X,Y],columns = ['x','y'])
df.plot('x','y',kind = 'scatter',s = 50,c = 'm',edgecolor = 'k')
plt.show()
    my_members = k_means_3_labels == order[k]
    plt.scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)
    cluster_center = k_means_3_cluster_centres[order[k]]
    plt.scatter(cluster_center[0],
                cluster_center[1],
                marker='o',
                c=col,
                s=200,
                alpha=0.8)
plt.axis('equal')
plt.title('KMeans 3')
'''
#2: NON-SPHERICAL SHAPES
'''

# Interleaved half-moons: cluster shapes that violate k-means' implicit
# assumption of convex, roughly spherical clusters.
[X, true_labels] = make_moons(n_samples=1000, noise=.05)

plt.figure(figsize=(12, 6))
plt.suptitle('Non-Spherical Shapes', fontsize=15)
plt.subplot(121)
# Left panel: scatter the points colored by their ground-truth class.
# NOTE(review): `colors` is defined earlier in the file (not visible here);
# zip() stops after the two class labels 0 and 1 regardless of its length.
for k, col in zip(range(2), colors):
    my_members = true_labels == k  # boolean mask selecting points of class k
    plt.scatter(X[my_members, 0], X[my_members, 1], c=col, marker='o', s=20)

plt.axis('equal')
plt.title('Original Data')

# Compute clustering with 2 Clusters
k_means_2 = KMeans(init='k-means++', n_clusters=2, n_init=10)
k_means_2.fit(X)
k_means_2_labels = k_means_2.labels_
예제 #32
0
"""Cluster the two-moons toy dataset with k-means and save scatter plots.

k-means assumes convex, roughly spherical clusters, so it is expected to
split the interleaved moons incorrectly; the two saved figures (raw data,
then points colored by k-means assignment) illustrate that limitation.
"""
# FIX: `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# the public `sklearn.datasets` path works on all versions.
from sklearn.datasets import make_moons
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

# Two interleaved half-moons; random_state pins the noise for reproducibility.
X, Y = make_moons(n_samples=400, noise=0.05, random_state=0)

# Raw, uncolored data.
plt.scatter(X[:, 0], X[:, 1])
plt.savefig('scatterplot.png')

# Fit k-means with the true number of clusters and predict a label per point.
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
kmeans_results = kmeans.predict(X)

print(kmeans_results)

# Same scatter, colored by cluster assignment.
plt.scatter(X[:, 0], X[:, 1], c=kmeans_results)
plt.savefig('scatterplot_color.png')