Example No. 1
def medoids(simulation_object, w_samples, b, B=200):
    inputs_set, psi_set, _, _, z = select_top_candidates(
        simulation_object, w_samples, B)

    D = pairwise_distances(psi_set, metric='euclidean')
    M, C = kmedoids.kMedoids(D, b)
    return inputs_set[M, :z], inputs_set[M, z:]
Example No. 2
def main():
    '''do clustering'''
    args = get_args()
    data = []
    with open(args[2]) as json_data:
        for line in json_data:
            tweet = json.loads(line)
            data.append(tweet['text'])

    print('The scikit-learn version is {}.'.format(sklearn.__version__))

    print('distance')
    # distance matrix
    distance = pairwise_distances(data, metric='jaccard')

    print('splitting')
    # split into k clusters
    M, clusters = kmedoids.kMedoids(distance, int(args[1]))
    print('split')
    print('medoids:')
    for point_idx in M:
        print(data[point_idx])

    print('')
    print('clustering result:')
    for label in clusters:
        for point_idx in clusters[label]:
            print('label {0}: {1}'.format(label, data[point_idx]))
    def run_kmedoids(self, distances):
        # cluster a precomputed distance matrix into self.branching_factor groups
        center_indices, labels_dict = kmedoids.kMedoids(
            distances, self.branching_factor)
        # flatten the {label: member indices} dict into a per-point label array
        labels = np.empty(distances.shape[0], dtype=int)
        for key, value in labels_dict.items():
            labels[value] = key
        return center_indices, labels
Example No. 4
def solver():
    parser = argparse.ArgumentParser()
    parser.add_argument("integer",
                        type=int,
                        help="Please give arguments as 'Centroid','Min','Max'")
    args = parser.parse_args()
    clusters = args.integer

    reader = DataReader()
    data = reader.loadData()
    simMatrix, indexes = genSimilarityMatrix(data)
    M, C = kmedoids.kMedoids(simMatrix, clusters)
    fileWriter = open('data/Kmedoids_output_{}.txt'.format(clusters), 'w')
    print('medoids', file=fileWriter)
    i = 1
    for point in M:
        print('medoid of cluster ', i, ' ', indexes[point], file=fileWriter)
        i = i + 1
    print(' ', file=fileWriter)
    print('clustering result:', file=fileWriter)
    i = 1
    for label in C:
        for point_idx in C[label]:
            print('Cluster ', i, ': ', indexes[point_idx], file=fileWriter)
        i = i + 1

    fileWriter.close()
    print("Clustering Done!!,No. of new clusters are {}".format(clusters))
    print("New clusters are stored in file-data/Kmedoids_output_{}.txt".format(
        clusters))
Example No. 5
def k_medoids(sample, num_clusters):
    # clusters the samples into the number of clusters (num_clusters) according 
    # to the K-Medoids clustering algorithm and returns the medoids and the 
    # samples that belong to each cluster
    D = distance_matrix(sample, sample)
    M, C = kMedoids(D, num_clusters)
    return M, C  
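A hypothetical usage sketch for the helper above; the sample points are made up, and it assumes distance_matrix (scipy.spatial) and kMedoids are already imported as in the snippet:

import numpy as np

sample = np.array([[0.0, 0.0], [0.2, 0.1], [5.0, 5.1], [5.2, 4.9]])
M, C = k_medoids(sample, num_clusters=2)
print("medoid indices:", M)               # one representative point per cluster
for label in C:
    print("cluster", label, "members:", C[label])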
Example No. 6
def consensus_matrix(distance_mx, ks):
    print("Building consensus matrix")

    n, m, _ = distance_mx.shape  # n stacked (m x m) distance matrices

    cons_mx = np.zeros((ks - 1, m, m))

    for k in range(2, ks):
        count = 1
        for node in distance_mx:
            print("Clustering for k = " + str(k) + " node " + str(count))

            _, clusters = km.kMedoids(node, k)

            for value in clusters.values():
                pairs = list(combinations(value, 2))
                for ij in pairs:
                    i, j = ij
                    cons_mx[k - 2][i][j] += 1
                    cons_mx[k - 2][j][i] += 1

            count += 1

        cons_mx[k - 2] = cons_mx[k - 2] / float(n)
        cons_mx[k - 2] = cons_mx[k - 2] / float(k)

    cons_mx = np.sum(cons_mx, axis=0)
    print("...built!")

    return cons_mx
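A hypothetical usage sketch for consensus_matrix; the random point sets below are purely illustrative, and km is assumed to be the same kmedoids module the function already references:

import numpy as np

rng = np.random.default_rng(0)
points = rng.random((3, 10, 2))          # 3 "nodes", 10 samples each (made-up data)
distance_mx = np.stack([
    np.linalg.norm(p[:, None, :] - p[None, :, :], axis=-1)   # one 10x10 distance matrix per node
    for p in points
])
cons = consensus_matrix(distance_mx, ks=5)   # consensus over k = 2, 3, 4
print(cons.shape)                            # (10, 10)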
Example No. 7
def main():
    '''do clustering'''
    args = get_args()
    data = []
    with open(args[2]) as json_data:
        for line in json_data:
            tweet = json.loads(line)
            data.append(tweet['text'])

    print('The scikit-learn version is {}.'.format(sklearn.__version__))

    print('distance')
    # distance matrix
    distance = pairwise_distances(data, metric='jaccard')

    print('splitting')
    # split into k clusters
    M, clusters = kmedoids.kMedoids(distance, int(args[1]))
    print('split')
    print('medoids:')
    for point_idx in M:
        print(data[point_idx])

    print('')
    print('clustering result:')
    for label in clusters:
        for point_idx in clusters[label]:
            print('label {0}: {1}'.format(label, data[point_idx]))
def clust_creation(x):
    #    T0=time()
    windscen = {}
    windscen[1] = pd.read_csv(WindScen_file_1, index_col=0)
    windscen[2] = pd.read_csv(WindScen_file_2, index_col=0)
    windscen[3] = pd.read_csv(WindScen_file_3, index_col=0)
    windscen[4] = pd.read_csv(WindScen_file_4, index_col=0)
    windscen[5] = pd.read_csv(WindScen_file_5, index_col=0)
    windscen[6] = pd.read_csv(WindScen_file_6, index_col=0)
    windscen[7] = pd.read_csv(WindScen_file_7, index_col=0)
    windscen[8] = pd.read_csv(WindScen_file_8, index_col=0)
    windscen[9] = pd.read_csv(WindScen_file_9, index_col=0)
    windscen[10] = pd.read_csv(WindScen_file_10, index_col=0)
    windscen[11] = pd.read_csv(WindScen_file_11, index_col=0)
    windscen[12] = pd.read_csv(WindScen_file_12, index_col=0)
    windscen[13] = pd.read_csv(WindScen_file_13, index_col=0)
    windscen[14] = pd.read_csv(WindScen_file_14, index_col=0)
    windscen[15] = pd.read_csv(WindScen_file_15, index_col=0)
    windinfo = pd.read_csv(windfarms_file, index_col=0)
    windfarms = windinfo.index.tolist()
    scenprob_init = {s: 1.0 / NScen for s in range(1, NScen + 1)}
    timeseries = []
    for k in range(len(windfarms)):
        timeseries.append([])
        for i in range(1, NScen + 1):
            timeseries[k].append(windscen[k + 1]['{0}'.format(i)].values)
    time_series = []
    for i in range(1, NScen + 1):
        l = list()
        for k in range(len(windfarms)):
            l += timeseries[k][i - 1].tolist()
        time_series.append(l)
    # use k-medoids to generate the clusters and their medoids (the representative scenario of each cluster)
    n_clusters = x
    D = pairwise_distances(np.array(time_series), metric='euclidean')
    # M holds the medoid indices; C maps each cluster label to the scenario IDs it contains (IDs only, not the data)
    M, C = kmedoids.kMedoids(D, n_clusters)
    cluster = []
    for i in range(n_clusters):
        cluster.append(list(C[i]))
    medoid_prob = {
        p: sum(scenprob_init[i + 1] for i in cluster[p])
        for p in range(len(cluster))
    }
    scenprob = {}
    clusters = []
    for i in range(len(cluster)):
        add_med = list(M)
        add_med.pop(i)
        clusters.append(cluster[i] + add_med)
        scenprob[i] = {}
        for j in clusters[i]:
            if (j in cluster[i]):
                scenprob[i][j + 1] = scenprob_init[j + 1]
            else:
                scenprob[i][j + 1] = medoid_prob[list(M).index(j)]
    return (clusters, scenprob)
def boundary_medoids(simulation_object, w_samples, b, B=200):
    inputs_set, psi_set, _, _, z = select_top_candidates(simulation_object, w_samples, B)

    hull = ConvexHull(psi_set)
    simplices = np.unique(hull.simplices)
    boundary_psi = psi_set[simplices]
    boundary_inputs = inputs_set[simplices]
    D = pairwise_distances(boundary_psi, metric='euclidean')
    M, C = kmedoids.kMedoids(D, b)
    
    return boundary_inputs[M, :z], boundary_inputs[M, z:]
Example No. 10
def partitional_approach(frame_instances, percentage=10):
    ''' Find prototypical frame instances using partitional clustering approach '''
    condensed_matrix, instance_indexes = create_distance_matrix(
        frame_instances)
    num_clusters = len(frame_instances) // percentage + 1  # integer division so kMedoids gets an int k
    medoids, clusters = kmedoids.kMedoids(squareform(condensed_matrix),
                                          num_clusters)
    medoid_instances = {}

    for point_index in medoids:
        frame_id = instance_indexes[point_index]
        medoid_instances[frame_id] = format_instance(frame_instances[frame_id])

    return medoid_instances
Example No. 11
def main():
    '''do clustering'''
    args = get_args()
    data = get_data(args, 2)
    data, classes = extract(data, 0)
    #data = fit_encode(data)
    data = np.array(data)

    # distance matrix
    distance = pairwise_distances(data, metric='euclidean')

    # split into k clusters
    M, clusters = kmedoids.kMedoids(distance, int(args[1]))

    print('centers:')
    for point_idx in M:
        print(data[point_idx])
Example No. 12
def main():
    '''do clustering'''
    args = get_args()
    data = get_data(args, 2)
    data, classes = extract(data, 0)
    #data = fit_encode(data)
    data = np.array(data)

    # distance matrix
    distance = pairwise_distances(data, metric='euclidean')

    # split into k clusters
    M, clusters = kmedoids.kMedoids(distance, int(args[1]))

    print('centers:')
    for point_idx in M:
        print(data[point_idx])
Example No. 13
def kmedoid(attributes, ids, n=2):
    # dimension reduction
    data = np.array(attributes)
    reduced_data = PCA(n_components=2).fit_transform(data)

    D = pairwise_distances(reduced_data, metric='euclidean')

    # split into n clusters (default 2)
    # M stores the indices of the points chosen as cluster centers
    M, C = kmedoids.kMedoids(D, n)

    group_members = [[] for i in range(n)]
    for i in range(n):
        for j in C[i]:
            group_members[i].append(ids[j])
    show_kmedoid(M, C, reduced_data)
    # return the k-medoids result and ids of patients for tracing
    return group_members, M, C, reduced_data
Example No. 14
def kmedoid(attributes, ids, n=2):
    # dimension reduction
    data = np.array(attributes)
    reduced_data = PCA(n_components=2).fit_transform(data)

    D = pairwise_distances(reduced_data, metric='euclidean')

    # split into n clusters (default 2)
    # M stores the indices of the points chosen as cluster centers
    M, C = kmedoids.kMedoids(D, n)

    group_members = [[] for i in range(n)]
    for i in range(n):
        for j in C[i]:
            group_members[i].append(ids[j])
    show_kmedoid(M, C, reduced_data)
    # return the k-medoids result and ids of patients for tracing
    return group_members, M, C, reduced_data
    def __init__(self, n = 20):
        self.data_matrix, self.items, self.features, self.sim_items, self.sim_feats = get_data()

        k = n
        if k > 200: k = 200

        for i in range(500):
            try:
                medoids, clusters = kMedoids(self.sim_items, k)
                break
            except:
                continue
                
        else:
            print "medoids failed"
            medoids = [0]
            clusters = {0:range(len(self.items))}

        super(GoodN, self).__init__(9, k, medoids)
Example No. 16
def kmedoids_active_learning(xtrain, ytrain, xact_sort, yact_sort, cut, n):
    from sklearn.metrics.pairwise import pairwise_distances
    import kmedoids
    print('kmedoids')
    result = np.zeros(n)
    for i in range(1, n):
        xact_sort = xact_sort[:cut, :]
        yact_sort = yact_sort[:cut]
        D = pairwise_distances(xact_sort, metric='euclidean')
        M, C = kmedoids.kMedoids(D, i)
        xact_medoids = xact_sort[M, :]
        yact_medoids = yact_sort[M]
        xtrain_new = np.concatenate((xtrain, xact_medoids), axis=0)
        ytrain_new = np.concatenate((ytrain, yact_medoids), axis=0)
        act_learn = svm.SVC(kernel='linear', C=1)
        act_learn.fit(xtrain_new, ytrain_new)
        score_km = act_learn.score(xtest, ytest)
        result[i] = score_km
        print(score_km)
    return (result)
Example No. 17
    def k_medoids_clust(self, data, dist_matrix, num_iter):
        # def k_medoids_cluster(self, data, dist_matrix, num_iter, w, r, verbose=True):

        # Turn dist_matrix from upper-triangle to full
        for i in range(len(data)):
            for j in range(0, i):
                dist_matrix[i][j] = dist_matrix[j][i]

        import kmedoids
        M, C = kmedoids.kMedoids(dist_matrix, self.num_clust, num_iter)

        # Wrap up, get in same format as kmeans
        self.medoids = []
        for c_ts_idx in M:
            self.medoids.append(data[c_ts_idx])

        self.ts_dists = defaultdict(dict)
        self.assignments = defaultdict(list)
        for c in C:  # just 0, 1, 2, .... k-1
            # print c_ts_idx
            c_ts_idx = M[c]
            for ts_idx in C[c]:
                self.assignments[c].append(data[ts_idx])
                self.ts_dists[c][ts_idx] = max(dist_matrix[c_ts_idx][ts_idx],
                                               dist_matrix[ts_idx][c_ts_idx])

        # Even though the whole point of medoids is to avoid Euclidean mean-based centroids, it is still
        # nice to show the 'mean' of the curves in each cluster, since the averaged curve is a smoother
        # representation of the cluster than the medoid curve alone
        self.centroids = []
        for c in C:
            cur_centroid = np.zeros(data.shape[1])
            for ts_idx in C[c]:
                cur_centroid += data[ts_idx]
            cur_centroid /= len(C[c])
            self.centroids.append(cur_centroid)
Example No. 18
def clusterKMedoids(shooterMeans, max_clusters=10):
    medoids = list()
    clusterings = list()
    performances = list()
    n, _ = shooterMeans.shape
    D = pairwise_distances(shooterMeans)

    for k in range(1, max_clusters + 1):
        print(k)
        m, c = kmedoids.kMedoids(D, k, 10000)

        medoids.append(m)

        # Cluster output of kMedoids is dictionary {cid: [x where c(x)==cid]}
        # Transform to list of cluster id for similarity with kMeans
        labels = [-1] * n
        for label in c:
            for idx in c[label]:
                labels[idx] = label

        labels = np.array(labels)  # Transform so that np.nonzero works

        wgss = 0
        for label in range(k):
            indices = np.nonzero(labels == label)[0]
            for combo in combinations(indices, 2):
                v1 = shooterMeans.loc[combo[0]]
                v2 = shooterMeans.loc[combo[1]]
                d = [v1[name] - v2[name] for name in shooterMeans.columns]
                dist = np.linalg.norm(d)**2
                wgss += dist

        clusterings.append(labels)
        performances.append(wgss)

    return clusterings, performances, medoids
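A hypothetical usage sketch with made-up data: run the function for k = 1..max_clusters and pick k at the "elbow" of the WGSS curve. It assumes numpy, pandas and the kmedoids module are imported as in the snippet above:

import numpy as np
import pandas as pd

demo = pd.DataFrame(np.random.rand(40, 3), columns=['x', 'y', 'z'])
clusterings, performances, medoids = clusterKMedoids(demo, max_clusters=6)
for k, wgss in enumerate(performances, start=1):
    print(k, round(wgss, 2))   # look for the k after which WGSS stops dropping sharply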
Example No. 19
g = sent_tokenize(f)

summary = open("summary.txt", "w")
op = open("med.txt", "w")
text = open("sentcode.txt", "r")
data = []
size = int(np.shape(vectorop)[0] / 4)
for line in text:
    data.append(line.strip().split())
data = np.asarray(data)
index = []
# distance matrix
D = pairwise_distances(data, metric='euclidean')

# split into size clusters
M, C = kmedoids.kMedoids(D, size)

print('medoids:')
for point_idx in M:
    print(data[point_idx], file=op)
    with open("sentcode.txt") as myFile:
        for num, line in enumerate(myFile, 1):
            if data[point_idx][1] in line:
                index.append(num)

index.sort()
print(index)

for i in index:
    print(g[i - 1], file=summary)
Example No. 20
    for e1 in r1:
        e1min = 100
        for e2 in r2:
            if area_dist[unique_area_idx[e1], unique_area_idx[e2]] < e1min:
                e1min = area_dist[unique_area_idx[e1], unique_area_idx[e2]]
        dist += e1min
    return dist


researcher_dist = np.ones((len(aid), len(aid)))

for x1, d1 in enumerate(aid):
    for x2, d2 in enumerate(aid):
        researcher_dist[x1, x2] = manhattan(d1, d2)

researcher_dist = np.maximum(researcher_dist, researcher_dist.T)

medoids, clusters = kmedoids.kMedoids(researcher_dist, n_clusters)

output = {}

for c in clusters.values():
    group = []
    for d in c:
        group += [rid[d]]
    for d in c:
        output[rid[d]] = list(set(group) - set([rid[d]]))

with open('data.json', 'w') as outfile:
    json.dump(output, outfile)
def compute_sankey(results_search, n_max_clusters, n_min_clusters, n_repet_assess_cluster_number, List_actions, day_selected):
    liste_resfinal = results_search
    
    webpages = [x[1] for x in liste_resfinal]
    flattened_webpages = [item for sublist in webpages for item in sublist]
    flattened_webpages = list(set(flattened_webpages))
    
    
    features = pd.DataFrame(index = flattened_webpages)
    
    distance_matrix = [[0 for i in range(len(flattened_webpages))] for i in range(0, len(flattened_webpages))]
    
    for i in range(0, len(flattened_webpages)):
        for j in range(i+1, len(flattened_webpages)):
            
            x_page_name = delete_first_tag(flattened_webpages[i])
            y_page_name = delete_first_tag(flattened_webpages[j])
            distance_matrix[i][j] = ((distance(x_page_name,y_page_name)))
            distance_matrix[j][i] = distance_matrix[i][j]
                    
    distance_matrix = np.array(distance_matrix)
    n_max = n_max_clusters
    n_min = n_min_clusters
    range_n_clusters = [i for i in range(n_min,n_max)]
    silhouette_avg_scores = [0 for i in range(n_min, n_max)]
    
    for j in range(0,n_repet_assess_cluster_number):
        
        for n_clusters in range_n_clusters:
            
            medoids, clusterer= kmedoids.kMedoids(distance_matrix, n_clusters)
            
            cluster_labels = [0 for i in range(len(distance_matrix))]
            
            for label in clusterer :
                for point_idx in clusterer[label]:
                    cluster_labels[point_idx] = label
                    
            silhouette_avg = silhouette_score(distance_matrix, cluster_labels, metric="precomputed")
            print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)
            silhouette_avg_scores[range_n_clusters.index(n_clusters)]+=silhouette_avg
    
    silhouette_avg_scores = (np.array(silhouette_avg_scores)/(n_repet_assess_cluster_number)).tolist()
    
    cluster_number = range_n_clusters[silhouette_avg_scores.index(max(silhouette_avg_scores))]
    medoids, clusters = kmedoids.kMedoids(distance_matrix, cluster_number)
    
    labels = [0 for i in range(len(distance_matrix))]
    
    for label in clusters :
        for point_idx in clusters[label]:
            labels[point_idx] = label
    features['labels'] = labels
    data_nodes = find_clusters_names(labels,features)
        
    label_to_process = 0
    
    for x in data_nodes :
        if "devis" in x :
            label_to_process = data_nodes.index(x)
        else :
            label_to_process =0
    
    ## Compute the clustered Sankey diagram ##########################################################
    
    
    # Initial computation with every node and every flow
    
    colors = []
    sources = []
    targets = []
    values= []
    links = []
    
    for sublist in List_actions:
        for i in range(0,len(sublist)-1):
            src_webpage = sublist[i]
            trg_webpage = sublist[i+1]
            if (src_webpage in flattened_webpages) and (trg_webpage in flattened_webpages):
                src_label = features.loc[src_webpage,'labels']
                trg_label= features.loc[trg_webpage, 'labels']
                
                
                if (src_label, trg_label) not in links:
                                          
                    links.append((src_label, trg_label))
                    values.append(1)
                    sources.append(src_label)
                    targets.append(trg_label)
                    
                else:
                    
                    values[links.index((src_label, trg_label))]+=1
    
    
    #clean up the Sankey a bit: remove bidirectional edges between immediately close nodes
     
    cleaned_values=[]
    cleaned_sources = []
    cleaned_targets = []
    sum_in_links = [0 for i in range(max(labels)+1)]
    sum_out_links = [0 for i in range(max(labels)+1)]
    
    for (src_label, trg_label) in links:
        if values[links.index((src_label, trg_label))] > 100:
            
                    
            if (trg_label, src_label) in links:
                
                if values[links.index((src_label, trg_label))] >= values[links.index((trg_label, src_label))]:
                    
                    
                    cleaned_values.append(values[links.index((src_label, trg_label))])
                    cleaned_sources.append(src_label)
                    cleaned_targets.append(trg_label)
                    sum_in_links[trg_label]+= values[links.index((src_label, trg_label))]
                    sum_out_links[src_label] += values[links.index((src_label, trg_label))]
                
            
            else:
                
                cleaned_values.append(values[links.index((src_label, trg_label))])
                cleaned_sources.append(src_label)
                cleaned_targets.append(trg_label)
                sum_in_links[trg_label]+= values[links.index((src_label, trg_label))]
                sum_out_links[src_label] += values[links.index((src_label, trg_label))]
    
    
    cleaned_val_V2 = []
    cleaned_src_V2 = []
    cleaned_trg_V2= []
    rate = 0.10
        
    for i in range(0,len(cleaned_values)):
        if (cleaned_values[i] > rate*sum_in_links[cleaned_targets[i]] and cleaned_values[i] > rate*sum_out_links[cleaned_sources[i]] and (cleaned_sources[i]!=label_to_process or cleaned_targets[i]==label_to_process)):
            cleaned_val_V2.append(cleaned_values[i])
            cleaned_src_V2.append(cleaned_sources[i])
            cleaned_trg_V2.append(cleaned_targets[i])
        
        
    
    
    #plot the final Sankey
                    
    for i in range (0, len(labels)):
        color_array = list(np.random.choice(range(256), size = 3))
        colors.append("rgba(" + str(color_array[0]) + ", " + str(color_array[1]) + ", " + str(color_array[2]) + ", 0.8 )")
                
    
    data_trace = dict(
        type='sankey',  
        orientation = "h",
        valueformat = ".0f",
        valuesuffix = " logs",
        textfont = dict(
                size = 12
        ),
        node = dict(
          pad = 22,
          thickness = 15,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label =  data_nodes
        ),
        link = dict(
          source =  cleaned_src_V2,
          target =  cleaned_trg_V2,
          value =  cleaned_val_V2,
          label =  ["" for x in cleaned_val_V2]
      ))                 
    
    layouts = dict(
        title = "Dynamique du traffic pertinent sur le site credit-agricole.fr le "+ str(day_selected) +" - clustering fonctionnel",
        font = dict(
          size = 10
        ),
        width = 1750,
        height = 800
    )   
    
    res = dcc.Tab(id='Graph_function', children =[
            
            
            dcc.Graph(
                    id = 'Sankey_function',
                    figure = {
                            'data' : [data_trace],
                            'layout' : layouts
                            }                                                    
                    )                                                
            ])
    
    
    return res
Example No. 22
import numpy as np
from kmedoids import kMedoids
from sklearn.metrics.pairwise import pairwise_distances

a = np.load(open('feats_saved_10k.bn', 'rb'))

already_sel = np.load(open('selected10000.bn', 'rb'))
remaining = np.setdiff1d(np.array(range(50000)), already_sel)
D = pairwise_distances(a[remaining, :], metric='euclidean')
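# k-medoids over the not-yet-selected feature vectors; their medoids become the next batch of representative samples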
M, C = kMedoids(D, 5000)
nd = np.array(list(already_sel) + list(remaining[M]))  # map medoid positions back to indices into the full sample set
np.save(open('selected15000.bn', 'wb'), nd)
Example No. 23
def _build_clusters(self,clust_num,method):
    timeseries=[]
    for k in range(len(self.data.windfarms)):
        timeseries.append([])
        for i in range(1,NScen+1):
#            print(k,i)
#            print(self.data.windscen[k+1]['{0}'.format(i)].values)
            timeseries[k].append(self.data.windscen[k+1]['{0}'.format(i)].values)
    time_series=[]
    for i in range(1,NScen+1):
        l=list()
        for k in range(len(self.data.windfarms)):
            l+=timeseries[k][i-1].tolist()
        time_series.append(l)
        
            
            
    # k-shape from kshape.core
    if method == 'k_shape':
        # selection of the number of clusters that should be done
        cluster_num = clust_num
        # apply the clustering method
        cluster = kshape(zscore(time_series, axis=1), cluster_num)
        self.clusters=[]
        for k in range(len(cluster)):
            self.clusters.append(cluster[k][1])
#kshape from tslearn (recommended by Paparrizos)
#        from tslearn.clustering import KShape
#        from tslearn.utils import to_time_series_dataset
#        formatted_dataset = to_time_series_dataset(time_series)
#        ks=ks=KShape(n_clusters=cluster_num, verbose=False)
#        y_pred=ks.fit_predict(formatted_dataset)
#        self.clusters=[]
#        for n in range(cluster_num):
#            self.clusters.append([])
#        for k in range(NScen):
#            self.clusters[y_pred[k]].append(k)
        

    if method=='k_means':
#k-means clustering 
        n_clusters=clust_num
        kmeans = KMeans(n_clusters, random_state=0).fit(time_series)
        kmeans.labels_
        self.clusters=[]
        for n in range(n_clusters):
            self.clusters.append([])
        k=0
        for i in range(NScen):
            k+=1
            self.clusters[kmeans.labels_[i]].append(k-1)
        
    if method=='hierar':
#hierarchical clustering
        n_clusters=clust_num
        cluster = AgglomerativeClustering(n_clusters, affinity='euclidean', linkage='ward').fit_predict(time_series)  
        self.clusters=[]
        for n in range(n_clusters):
            self.clusters.append([])
        k=0
        for i in range(NScen):
            k+=1
            self.clusters[cluster[i]].append(k-1) 
    
    if method == 'k_medoids':
        # k-medoids clustering
        n_clusters = clust_num
        D = pairwise_distances(np.array(time_series), metric='euclidean')
        # M holds the medoid indices; C maps each cluster label to the scenario IDs it contains (IDs only, not the data)
        M, C = kmedoids.kMedoids(D, n_clusters)
        self.clusters=[]
        self.medoids=list(M)
        for i in range(n_clusters):
            self.clusters.append(list(C[i]))
Example No. 24
        scaler = RobustScaler()
    elif (sys.argv[4] == '5'):
        scaler = Normalizer()
    data = scaler.fit_transform(data)

print(data[0:5])

from sklearn.metrics.pairwise import pairwise_distances
D = pairwise_distances(data, metric='euclidean', n_jobs=1)
print("Pairwise shape : ")
print(D.shape)
# np.save('all_data.npy', D)
print("Done creating distance matrix, start the algorithm")

# split into 2 clusters
M, C = kmedoids.kMedoids(data, D, int(sys.argv[2]))

st = ''
print('medoids:')
for point_idx in M:
    print(data[point_idx])
    st = st + str(point_idx) + '\n'
f = open(sys.argv[3], 'w')
f.write(st)

st = ''
print('')
print('clustering result:')
for label in C:
    for point_idx in C[label]:
        print('label {0}: {1}'.format(label, point_idx))
Example No. 25
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

import kmedoids

# 3 points in dataset
data = np.array([[1,1], 
                [2,2], 
                [10,10]])

# distance matrix
D = pairwise_distances(data, metric='euclidean')

# split into 2 clusters
M, C = kmedoids.kMedoids(D, 2)

print('medoids:')
for point_idx in M:
    print( data[point_idx] )

print('')
print('clustering result:')
for label in C:
    for point_idx in C[label]:
        print(label, data[point_idx])
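These snippets all assume a kmedoids module that exposes a kMedoids(D, k) function, typically a small local helper rather than a published package. For context, here is a minimal sketch of such a helper with the interface the examples rely on (an array of medoid indices plus a {label: member indices} dict, computed by Voronoi iteration over a precomputed distance matrix). It is an illustrative reimplementation under those assumptions, not the module the original authors used.

import numpy as np


def kMedoids(D, k, tmax=100, rng=None):
    """Illustrative k-medoids over a precomputed n x n distance matrix D.

    Returns (M, C): M holds the medoid indices and C maps each cluster label
    to the indices of its members, matching the convention used above.
    """
    rng = np.random.default_rng(rng)
    n = D.shape[0]
    if k > n:
        raise ValueError('too many medoids')

    # random distinct starting medoids
    M = np.sort(rng.choice(n, size=k, replace=False))
    C = {}
    for _ in range(tmax):
        # assignment step: every point joins its nearest current medoid
        labels = np.argmin(D[:, M], axis=1)
        for label in range(k):
            C[label] = np.where(labels == label)[0]

        # update step: within each cluster, the member with the smallest mean
        # distance to the other members becomes the new medoid
        M_new = M.copy()
        for label in range(k):
            members = C[label]
            if len(members) == 0:      # keep the old medoid for an empty cluster
                continue
            within = D[np.ix_(members, members)].mean(axis=1)
            M_new[label] = members[np.argmin(within)]
        M_new.sort()

        if np.array_equal(M, M_new):   # converged
            break
        M = M_new

    # final assignment against the final medoids
    labels = np.argmin(D[:, M], axis=1)
    for label in range(k):
        C[label] = np.where(labels == label)[0]
    return M, C

Several of the examples above also pass a maximum iteration count as a third positional argument, which corresponds to tmax here.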





Example No. 26
k_min = 165  # minimum number of clusters
k_max = 198  # maximum number of clusters
radius = 500

print("Radius ", radius, "k_min ", k_min, "k_max ", k_max)
max_gen = 150

# Initialization

# The initial population is set; each individual in the population is determined randomly
D = pairwise_distances(data, metric='euclidean')
# Now create an initial population consisting of a certain number of individuals
population = []
for i in range(population_size):
    no_of_cluster = np.random.randint(k_min, k_max + 1)
    M, C = kmedoids.kMedoids(D, no_of_cluster)
    medoid = []
    for item in M:
        medoid.append(data[item])
    if medoid not in population:
        population.append(medoid)

gen_no = 0

#while loop runs till maximum generation
while (gen_no < max_gen):
    coverage = [
        calculate_coverage(population[i]) for i in range(0, population_size)
    ]
    tl = [tour_length(population[i]) for i in range(0, population_size)]
    non_dominated_sorted_population = fast_non_dominated_sort(
Example No. 27
def k_medoids(sample, num_clusters):
    D = scipy.spatial.distance_matrix(sample, sample)
    M, C = kMedoids(D, num_clusters)
    return M, C
Example No. 28
plt.title('MDS on the distance matrix between PIs')
plt.show()
print(
    "time elapsed to compute the distance matrix between diagrams: ",
    timeSpent)

nbclusters = 6
# k-medoids classification on the persistence diagrams

# run k-medoids nInitialisation times and keep the clustering with the lowest error
nInitialisation = 1000

errorFinal = 10**25
for i in range(nInitialisation):
    errorTot = 0
    results = kMedoids(dist_mat, nbclusters)
    clusters = results[1]
    for i in range(nbclusters):
        cluster = pi.getIndivInCluster(clusters, i, label_color)
        error = pi.errorInCluster(cluster, nbclusters)
        errorTot = errorTot + error
        #print("error in cluster i", error)
        #print("individuals in cluster : ", i, cluster)
    if (errorFinal > errorTot):
        errorFinal = errorTot
        clusterFinal = clusters
print("taux d'erreur : ", errorFinal / (nbclusters * nbIndivByClass) * 100)

homologyDegree = 1
sigma2 = 0.0001
b = 0.02
Example No. 29
        kmeans = KMeans(init='k-means++',
                        n_clusters=args.clusters,
                        n_init=args.clusters,
                        max_iter=100)
        labels = kmeans.fit_predict(sbg)
        silscore = silhouette_score(sbg, labels)
        cname = 'kmeans_' + str(i)
        kmeansObject = utils.cmethod(cname, labels, silscore, 0.0,
                                     args.maxfract)
        methods.append(kmeansObject)

    print >> sys.stderr, passedTime(
        start, time.time()), "KMEDOIDS  (probabilistic, YMMV)"
    # Same issue as Kmeans. Same approach
    for i in xrange(1, 6):
        medoids, clusterinfo, labels = kmedoids.kMedoids(
            s_distance, args.clusters)
        silscore = silhouette_score(s_distance, labels)
        cname = 'kmedoids_' + str(i)
        kmedObject = utils.cmethod(cname, labels, silscore, 0.0, args.maxfract)
        methods.append(kmedObject)

#####  Create consistent sample groups  ######################################################
# This outputs a list of samples that always occur together in a cluster, no matter which method is used.
# It also adds a 'shared' column to the clusters output file. An attempt is made to give similar clusters
# similar labels, so that cluster B1 largely contains the same samples under each clustering method.

if args.print_groups:
    print >> sys.stderr, passedTime(
        start, time.time()), "Finding consistent groups in all methods used"
    setlist = [i.dups for i in methods if i.ok]
    grouplist1, tally = utils.persistent_groups(copy.copy(setlist),
Example No. 30
# coding: utf-8
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

import kmedoids

# 3 points in dataset
data = np.array([[1,1], 
                [2,2], 
                [10,10]])

# distance matrix
D = pairwise_distances(data, metric='euclidean')

# split into 2 clusters
M, C = kmedoids.kMedoids(D, 2)

print('medoids:')
for point_idx in M:
    print( data[point_idx] )

print('')
print('clustering result:')
for label in C:
    for point_idx in C[label]:
        print('label {0}: {1}'.format(label, data[point_idx]))


Example No. 31
        #print(predict)
        print("purity:", purity(predict['predict'], target))

        # concatenate labels to df as a new column
        r = pd.concat([data, predict], axis=1)
        # plot the cluster assignments
        plt.scatter(r['Life expectancy at birth, total (years)'],
                    r['GNI (constant 2010 US$)'],
                    c=r['predict'],
                    cmap="plasma")
        plt.show()
        print()
#kmedoids model
distances = pairwise_distances(data, metric='euclidean')

M, C = kmedoids.kMedoids(distances, 4)
predict = np.zeros(len(data))
for label in C:
    for point_idx in C[label]:
        predict[point_idx] = label

predict = pd.DataFrame(predict)
predict.columns = ['predict']

print("purity:", purity(predict['predict'], target))

plt.scatter(data['Life expectancy at birth, total (years)'],
            data['GNI (constant 2010 US$)'],
            c=predict['predict'],
            cmap="plasma")
plt.show()
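This example calls a purity helper that is not shown. A minimal sketch of the standard cluster-purity metric, as an assumption about what that helper computes, could look like:

import numpy as np

def purity(predicted, target):
    # purity = (1/N) * sum over predicted clusters of the largest true-class count in that cluster
    predicted = np.asarray(predicted)
    target = np.asarray(target)
    total = 0
    for label in np.unique(predicted):
        members = target[predicted == label]
        _, counts = np.unique(members, return_counts=True)
        total += counts.max()
    return total / len(target)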
Example No. 32
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
import kmedoids

W0 = np.load("W0_10d.npy")
# distance matrix
D = pairwise_distances(W0, metric='euclidean')
# split into 60 clusters
M, C = kmedoids.kMedoids(D, 60)

C_label = np.zeros(
    35390)  # 35390 = 17695*2 (number of genes from both networks)

for label in C:
    for point_idx in C[label]:
        C_label[point_idx] = label

np.save("kmedoids.npy", C_label.astype(int))
        #tempar=np.array([inputar])
        D=np.vstack([D,inputar])
        #D=D.reshape(int(length/),int(length/2))


'''  
if M is not None:
    M, C = kmedoids.kMedoids(D, numberofclusters)
    if M is not None:
        M, C = kmedoids.kMedoids(D, numberofclusters)
        if M is not None:
            raise Exception('too many medoids (after removing duplicate points)')
'''
#file=open("output.txt","w")

M, C = kmedoids.kMedoids(D, numberofclusters)
#print(C)
print('medoids:')
for point_idx in M:
    outputar=np.concatenate((outputar,data[point_idx]),axis=0)
np.savetxt('medoid.txt',M,fmt="%s")
np.savetxt('output.txt',outputar,fmt="%s",delimiter=',')    
print('')
print('clustering result:',M)
#np.savetxt('clusteringoutput.txt',C)
cluster=[None] * int(length/2)
for label in C:
    for point_idx in C[label]:
        cluster[point_idx]=M[label]
print(cluster)
np.savetxt('clusteringoutput.txt',cluster,fmt="%s")       
Example No. 34
def pairwiseClustering(df1, df2):
    clean_lyrics = getCleanLyrics(df1, df2)
    vec = TfidfVectorizer(analyzer='word',
                          ngram_range=(1, 2),
                          min_df=0,
                          stop_words='english',
                          max_features=5000)
    tfidf_matrix = vec.fit_transform(clean_lyrics)
    feature_names = vec.get_feature_names()
    n1 = len(df1['lyrics'])
    tfidf_vectors = tfidf_matrix.toarray()

    n = len(clean_lyrics)
    distances = [[0 for x in range(n)] for y in range(n)]

    d_file = open('distances_bigram.txt', 'a+')

    for i in range(n):
        for j in range(n):
            distances[i][j] = 10 * round(
                np.linalg.norm(tfidf_vectors[i] - tfidf_vectors[j]), 5)
            d_file.write(str(distances[i][j]))
            if (j != n - 1):
                d_file.write(',')
            else:
                d_file.write('\n')

    d_file.close()

    maxx = 0
    minx = 10000
    count = 0
    sum = 0
    for i in range(n):
        for j in range(n):
            if distances[i][j] != 0:
                sum += distances[i][j]
                count += 1
                if (distances[i][j] > maxx):
                    maxx = distances[i][j]
                if (distances[i][j] < minx):
                    minx = distances[i][j]

    import kmedoids

    A = np.matrix(distances)

    n = len(A)

    def cost(d_mat, M, C):
        k = len(M)
        costs = []
        for i in range(k):
            costs.append(0)
        for c_i in range(k):
            for i in C[c_i]:
                costs[c_i] += d_mat[M[c_i], i]

        return np.sum(costs)

    M, C = kmedoids.kMedoids(A, n, 2)

    for i in range(100):
        t_M, t_C = kmedoids.kMedoids(A, n, 2)
        if (cost(A, t_M, t_C) < cost(A, M, C)):
            M = t_M
            C = t_C

    print "Pair || " + str(df1['year'].iloc[0]) + ": " + str(len(
        df1['year'])) + ", " + str(df2['year'].iloc[0]) + ": " + str(
            len(df2['year']))
    print "===================================================="
    count1 = 0
    count2 = 0
    print "Cluster 1 : " + str(len(C[0]))
    for point in C[0]:
        if point < n1:
            count1 += 1
        else:
            count2 += 1
    if count1 > count2:
        c_1 = count1
    else:
        c_2 = count2

    print str(df1['year'].iloc[0]) + ": " + str(count1) + ", " + str(
        df2['year'].iloc[0]) + ": " + str(count2)
    count1 = 0
    count2 = 0
    print "Cluster 2 : " + str(len(C[1]))
    for point in C[1]:
        if point < n1:
            count1 += 1
        else:
            count2 += 1
    if count1 > count2:
        c_1 = count1
    else:
        c_2 = count2
    print str(df1['year'].iloc[0]) + ": " + str(count1) + ", " + str(
        df2['year'].iloc[0]) + ": " + str(count2)

    accuracy = (c_1 + c_2) * 1.0 / n

    print "\nAccuracy: " + str(
        accuracy) + "\n\n===================================================="
    print "\n"

    return accuracy
Example No. 35

# split into 2 clusters
def cost(d_mat, M, C):
    k = len(M)
    costs = []
    for i in range(k):
        costs.append(0)
    for c_i in range(k):
        for i in C[c_i]:
            costs[c_i] += d_mat[M[c_i], i]

    return np.sum(costs)


M, C = kmedoids.kMedoids(D, n, 2)

for i in range(1000):
    t_M, t_C = kmedoids.kMedoids(D, n, 2)
    if (cost(D, t_M, t_C) < cost(D, M, C)):
        M = t_M
        C = t_C

print('medoids:')
for point_idx in M:
    # print( data[point_idx] )
    print point_idx

# print('')

print('clustering result:')