Example #1
def use_k_means(x_pca, y):
    # Cluster the PCA-reduced data into k = 10 groups
    k = 10

    y_pred, centroids_pred = k_mean(k, x_pca)

    # Score the predicted labels against the true labels
    score = get_score(y, y_pred)
    print(score)

    title = 'k-means-mnist-score-' + str(score)

    k_show(x_pca, title, y_pred, centroids_pred)
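The `k_mean` helper used above is project-specific, but its contract is plain Lloyd's algorithm: take the number of clusters and the data, return per-point labels and the fitted centroids. A minimal NumPy sketch of that contract (the function name and return order mirror the example; everything else is an assumption):

import numpy as np

def k_mean(k, x, n_iters=100, seed=0):
    """Minimal Lloyd's algorithm returning (labels, centroids), as in the example above."""
    rng = np.random.default_rng(seed)
    # Start from k distinct points picked at random as the initial centroids
    centroids = x[rng.choice(len(x), size=k, replace=False)]
    for _ in range(n_iters):
        # Assign every point to its nearest centroid
        dists = np.linalg.norm(x[:, None, :] - centroids[None, :, :], axis=-1)
        labels = dists.argmin(axis=1)
        # Move each centroid to the mean of its assigned points (keep it if the cluster is empty)
        new_centroids = np.array([
            x[labels == j].mean(axis=0) if np.any(labels == j) else centroids[j]
            for j in range(k)
        ])
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return labels, centroids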
Example #2

def k_mean_validation(K):
    D = 2
    B = 10000

    KM = km.k_mean("data2D.npy")
    # Required arguments: number of clusters, dimension of the points, number of points
    _, segment_ids, X_data, mu = KM.cluster(K, D, B, 1.0 / 3.0)

    # Feed the held-out validation set to compute the loss against the learned cluster centers
    loss, _ = KM.cal_loss(KM.validation.astype(np.float32), mu, D)
    with tf.Session():
        print("K =", K, ":", loss.eval())
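`KM.cal_loss` here belongs to the example's own `km` module; conceptually it is just the summed squared distance from each held-out validation point to its nearest learned centroid. A small illustrative NumPy sketch of that quantity (not the module's actual implementation):

import numpy as np

def validation_loss(points, centroids):
    """Sum of squared distances from each validation point to its nearest centroid."""
    # Pairwise squared distances, shape (num_points, num_centroids)
    sq_dists = ((points[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=-1)
    # Every point contributes the distance to its closest centroid
    return sq_dists.min(axis=1).sum()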
Example #4
def k_comparison(K):
    D = 2
    B = 10000

    KM = km.k_mean("data2D.npy")
    _, segment_ids, X_data, mu = KM.cluster(K, D, B)

    # Count how many of the B points land in each of the K clusters
    data = tf.ones(shape=[B])
    division = tf.unsorted_segment_sum(data, segment_ids, K)

    with tf.Session():
        # Print the fraction of points assigned to each cluster, then plot the clustering
        print("K =", K, ":", division.eval() / B)
        plot.plot_cluster(segment_ids, X_data, mu, K)
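Summing a vector of ones with `tf.unsorted_segment_sum` is just a per-cluster point count, so the fraction printed above can be sanity-checked outside the TensorFlow graph with `np.bincount` (a small illustrative sketch, not part of the original example):

import numpy as np

def cluster_fractions(segment_ids, K):
    """Fraction of points assigned to each of the K clusters."""
    # bincount over the cluster ids is the NumPy analogue of
    # unsorted_segment_sum over a vector of ones
    counts = np.bincount(segment_ids, minlength=K)
    return counts / counts.sum()

# e.g. cluster_fractions(np.array([0, 2, 2, 1, 0]), K=3) -> array([0.4, 0.2, 0.4])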
Example #6
def cluster_articles(supervised=False,
                     target_dimension=10,
                     sentence=False,
                     path_result="result.csv"):
    """Cluster the articles with a k_mean algorithm, write the results
    to a csv file and return a list of cluster objects."""

    chemin = "preprocessed_df" + sentence * "_sentence" + ".csv"

    try:
        # Load a preprocessed df
        df = pd.read_csv(chemin, index_col=0)

    except IOError:
        # Build the preprocessed df if it is missing
        print("Missing preprocessed file")
        df = create_df("df_brown.csv", sentence=sentence)
        df.to_csv(chemin)

    if supervised:
        real_clusters = df["real_cluster"]
        df = df.drop("real_cluster", axis=1)

    df = df.drop("text", axis=1)

    if PCA_val:
        # Reduce the dimension with a PCA
        pca = PCA(n_components=target_dimension, svd_solver='full')

        print("Fit and Transform...")
        pca.fit(df)
        print("Transform...")
        df_reduced = pd.DataFrame(pca.transform(df))

    if MDS_val:
        # Reduce the dimension with an MDS
        mds = manifold.MDS(target_dimension, max_iter=100, n_init=1)
        df_reduced = pd.DataFrame(mds.fit_transform(df))

    # Del the initial df
    del df

    # The score at each iteration of the algorithm
    scores = []
    model = None

    centroids = None

    for i in range(N):
        if centroids is not None:
            # Centroids from the previous iteration are reused as the starting point
            clusters, cost, centroids = k_mean(df_reduced,
                                               number_clusters,
                                               centroids=centroids)
            labels = [value for (key, value) in sorted(clusters.items())]
            score = silhouette_score(df_reduced, labels, metric='cosine')

        else:
            best_score = 0
            # First iteration: run the algorithm multiple times and keep the best centroids
            for _ in range(num_first_tests):
                clusters, cost, centroids = k_mean(df_reduced, number_clusters)
                labels = [value for (key, value) in sorted(clusters.items())]
                score = silhouette_score(df_reduced, labels, metric='cosine')
                if score > best_score:
                    # Best iteration so far: save the configuration
                    best_centroids = centroids
                    best_score = score
                    best_labels = labels
                    best_clusters = clusters
            centroids = best_centroids
            score = best_score
            labels = best_labels
            clusters = best_clusters

        scores += [score]

        # Pairwise cosine distances between every pair of points
        distances = pairwise_distances(df_reduced, metric='cosine')
        # Silhouette score for each point, computed from the precomputed distance matrix
        sil_samples = silhouette_samples(distances, labels, metric='precomputed')

        # Find the misclassified elements
        min_elements = choose_min_elements(sil_samples)

        # Load the initial dataset and split it into training and testing sets
        df_brown = pd.read_csv("df_brown.csv", index_col=0)
        df_test_brown = df_brown.iloc[min_elements]
        df_train_brown = df_brown.drop(df_brown.index[min_elements])

        labels_as_dict = {}
        for a in range(len(labels)):
            if a not in min_elements:
                labels_as_dict[a] = labels[a]

        train_Y = pd.DataFrame(labels_as_dict.values())

        labels = pd.get_dummies(train_Y[0])
        # number of clusters in the df
        s = labels.shape[1]

        # Correction for when a cluster has been emptied out by the algorithm
        if s < number_clusters:
            headers = list(labels)
            size_train = df_train_brown.shape[0]
            for c in range(number_clusters):
                if c not in headers:
                    labels[c] = np.array([0] * size_train)

        train_Y.to_csv("train_y_nn.csv")
        df_train_brown.to_csv("train_nn.csv")
        df_test_brown.to_csv("test_nn.csv")

        # Neural network returns prediction, new vector and model
        predictions, new_vectors, model = predict_neural_network(
            df_train_brown, labels, df_test_brown, target_dimension,
            number_clusters, model)

        # Update the vectors in the dataframe
        for j in range(len(min_elements)):
            index = min_elements[j]
            vector = new_vectors[j]
            for k in range(len(vector)):
                df_reduced[k][index] = vector[k]

    # Write the results in a csv file
    predicted_clusters = [value for (key, value) in sorted(clusters.items())]

    df_brown["pred_cluster"] = predicted_clusters
    df_brown.to_csv(path_result)

    # Uncomment to plot the silhouette according to the iterations
    """
    plt.xlabel("Iteration")
    plt.ylabel("Silhouette")
    plt.title("Evolution of the silhouette after " + str(N) + " iterations")
    plt.plot(range(N), scores)
    plt.show()
    """

    return clusters, df_reduced
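The example relies on a `choose_min_elements` helper that is not shown; judging by how its result is used (indices of the points handed to the neural network for re-embedding), it presumably selects the points with the worst silhouette values. A hypothetical sketch of such a selection (the fixed fraction is an assumption, not the author's actual rule):

import numpy as np

def choose_min_elements(sil_samples, fraction=0.1):
    # Hypothetical helper: indices of the points with the lowest silhouette values
    n_worst = max(1, int(len(sil_samples) * fraction))
    # argsort ascending, so the first n_worst positions are the least well-clustered points
    return list(np.argsort(sil_samples)[:n_worst])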
Example #7
def main():
    # Build the input pipeline and the k_mean network, then train and evaluate
    inputs = input_pipeline()
    print("input pipeline ready")
    net = k_mean()
    _solver = solver(net, inputs, './log')
    _solver.train_and_test()
Example #8
i_time = config.getint('sys', 'i_time')

with open(complete_file_path, 'w') as f:
    f.write("0," + str(i_time * 11))

with open(os.path.join(file_path, "pid.txt"), 'w') as f:
    f.write(str(os.getpid()))

from knn import knn
from ada_boost import ada_boost
from random_forest import random_forest
from logistic import logistic
from svm import svm
from decision_tree import c4_5,cart
from k_mean import k_mean
from xgboost_clf import xgboost
from gbdt_clf import gbdt
from net import net

print('knn:', knn(i_time=i_time))
print('AdaBoost:', ada_boost(i_time=i_time))
print('random forest:', random_forest(i_time=i_time))
print('logistic regression:', logistic(i_time=i_time))
print('C4.5:', c4_5(i_time=i_time))
print('cart:', cart(i_time=i_time))
print('k_mean:', k_mean(i_time=i_time))
print('xgboost:', xgboost(i_time=i_time))
print('gbdt:', gbdt(i_time=i_time))
print('SVM:', svm(i_time=i_time))
print('net:', net(i_time=i_time))
Example #9
            ratio = random.random()
            raw_cubs.append(
                Lion(selected_male.x * ratio + selected_female.x *
                     (1 - ratio)))
            raw_cubs.append(
                Lion(selected_female.x * ratio + selected_male.x *
                     (1 - ratio)))

        # Select some of the cubs for mutation
        selected_mutation_cubs = random.sample(
            raw_cubs, int(len(raw_cubs) * mutation_ratio))
        for i in selected_mutation_cubs:
            i.mutation(maxvalue=y, minvalue=x)

        # Split the cubs into two groups with k_mean
        common_gender_cubs = k_mean.k_mean(raw_cubs, 2, 10)
        boy_cubs = common_gender_cubs[0]
        girl_cubs = common_gender_cubs[1]

        # Remove the surplus weaker offspring so both groups end up the same size
        diff = len(boy_cubs) - len(girl_cubs)
        if diff > 0:
            for i in range(diff):
                boy_cubs.remove(min(boy_cubs, key=attrgetter('fitness')))
        else:
            for i in range(-diff):
                girl_cubs.remove(min(girl_cubs, key=attrgetter('fitness')))

        # Territorial defense
        for meow in range(generate_nomad_time):  # Generate nomad lions
            nomads.append(Lion(random.uniform(x, y)))
Example #10
    return dis

def cost(m, data):
    # Total euclidean distance from the cluster mean m to every point in the cluster
    total = 0
    for i in range(len(data)):
        dist = euclidean_distance(m, data[i])
        total = total + dist
    return total

# Bisecting k-means: start with a k=2 split, then keep splitting
# the most expensive cluster until kn clusters are obtained
kn = 4
cluster = [[] for x in range(kn)]
mean = [[] for x in range(kn)]
i = 0
k = 0
cluster[k], cluster[k + 1], mean[k], mean[k + 1] = k_mean(my_data)

while True:
    cost_dist = []

    # Cost of every cluster produced so far
    for i in range(k + 2):
        cost1 = cost(mean[i], cluster[i])
        cost_dist.append(cost1)
    # Split the most expensive cluster into two new clusters
    a = cost_dist.index(max(cost_dist))
    cluster[a], cluster[k + 2], mean[a], mean[k + 2] = k_mean(cluster[a])
    k = k + 1
    if kn == k + 2:
        break
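The snippet above is a hand-rolled bisecting k-means: keep splitting the most expensive cluster in two until the target number of clusters is reached. For comparison, a compact sketch of the same idea built on scikit-learn's two-cluster KMeans (illustrative only; the helper name and cost definition are mine, not the original author's):

import numpy as np
from sklearn.cluster import KMeans

def bisecting_k_means(data, n_clusters):
    # Repeatedly split the cluster with the largest total distance to its mean
    clusters = [np.asarray(data)]
    while len(clusters) < n_clusters:
        costs = [np.linalg.norm(c - c.mean(axis=0), axis=1).sum() for c in clusters]
        worst = clusters.pop(int(np.argmax(costs)))
        # Split the most expensive cluster into two with plain k-means
        labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(worst)
        clusters.append(worst[labels == 0])
        clusters.append(worst[labels == 1])
    return clusters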