def affinity_propagation(crime_rows, column_names):
    """
        damping : float, optional, default: 0.5
            Damping factor between 0.5 and 1.
        convergence_iter : int, optional, default: 15
            Number of iterations with no change in the number of estimated 
            clusters that stops the convergence.
        max_iter : int, optional, default: 200
            Maximum number of iterations.
        preference : array-like, shape (n_samples,) or float, optional
            Preferences for each point - points with larger values of preferences 
            are more likely to be chosen as exemplars. 
            The number of exemplars, ie of clusters, is influenced by the input 
            preferences value. If the preferences are not passed as arguments, 
            they will be set to the median of the input similarities.
        affinity : string, optional, default=``euclidean``
            Which affinity to use. At the moment precomputed and euclidean are 
            supported. euclidean uses the negative squared euclidean distance 
            between points.
    """
    crime_xy = [crime[0:2] for crime in crime_rows]
    crime_info = [crime[2:] for crime in crime_rows]
    print("Running Affinity Propagation")
    # TODO: Parameterize this
    affinity_prop = AffinityPropagation()
    #affinity_propagation_labels = affinity_prop.fit_predict(crime_xy)
    affinity_prop.fit(random_sampling(crime_xy, num_samples=5000))
    affinity_propagation_labels = affinity_prop.predict(crime_xy)
    print("formatting....")
    return _format_clustering(affinity_propagation_labels, crime_xy, crime_info, 
            column_names)
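
# The docstring above mirrors scikit-learn's parameter docs. A small
# self-contained sketch (synthetic data, illustrative values) of how
# preference and damping steer the number of exemplars:
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=3, random_state=0)
for pref in (None, -50, -500):  # more negative preference -> fewer exemplars
    ap_demo = AffinityPropagation(damping=0.9, preference=pref).fit(X_demo)
    print(pref, "->", len(ap_demo.cluster_centers_indices_), "clusters")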
Example #2
import pickle

from sklearn import preprocessing
from sklearn.cluster import AffinityPropagation


def loadKmeansData(dataArrayTest, dataArrayTrain, k, m='load'):
    if m == 'load':
        # pickle files must be opened in binary mode
        with open('centroid', 'rb') as centroidRead:
            centroid = pickle.load(centroidRead)
        with open('labelCluster', 'rb') as labelClusterRead:
            labelCluster = pickle.load(labelClusterRead)
        with open('labelPre', 'rb') as labelPreRead:
            labelPre = pickle.load(labelPreRead)
    else:
        dataArrayTestNorm = preprocessing.normalize(dataArrayTest)
        dataArrayTrainNorm = preprocessing.normalize(dataArrayTrain)
        # clf = MiniBatchKMeans(init='k-means++', n_clusters=k, n_init=10)
        clf = AffinityPropagation()
        # clf = DBSCAN(min_samples=30)
        pre = clf.fit(dataArrayTrainNorm)

        centroid = pre.cluster_centers_
        labelCluster = pre.labels_
        labelPre = clf.predict(dataArrayTestNorm)

        # cache the results for the 'load' path
        with open('centroid', 'wb') as centroidWrite:
            pickle.dump(centroid, centroidWrite)
        with open('labelCluster', 'wb') as labelClusterWrite:
            pickle.dump(labelCluster, labelClusterWrite)
        with open('labelPre', 'wb') as labelPreWrite:
            pickle.dump(labelPre, labelPreWrite)

    return centroid, labelCluster, labelPre
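
# A usage sketch for the cache-or-fit pattern above; the array shapes are
# placeholders, and any m other than 'load' refits and re-caches.
import numpy as np

train_arr = np.random.rand(100, 4)  # placeholder training features
test_arr = np.random.rand(20, 4)    # placeholder test features
# k only matters for the commented-out MiniBatchKMeans variant
centroid, labelCluster, labelPre = loadKmeansData(test_arr, train_arr, k=10, m='fit')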
Example #3
def ap():
    data_index = [
        "dp_tr",
        "dp_t",
        "dq_tr",
        "dq_t",
        "du_tr",
        "du_t",
        "di_tr",
        "di_t",
        "dp_s",
        "dq_s",
        "du_s",
        "di_s",
        "dp_dq",
        "first_h",
        "third_h",
        "fifth_h",
    ]
    feature = readFeature(db)
    sample, n_com, pca_fit = do_pca()
    sample = pd.DataFrame(sample)
    sample["p_n"] = feature["p_n"].values

    sample_up = sample[sample.p_n == 1].iloc[:, 0:n_com]
    sample_down = sample[sample.p_n == 0].iloc[:, 0:n_com]

    start = time.time()
    print("start to do training")
    p = -0.5
    af_up = AffinityPropagation(damping=0.5, preference=p).fit(sample_up)
    af_down = AffinityPropagation(damping=0.5, preference=p).fit(sample_down)
    print("Event number of starting appliances:", af_up.predict(sample_up))
    print("Event number of stoping appliances:", af_down.predict(sample_down))
    # feature['labels'] = af.labels_
    saveModel2mdb(
        db,
        [
            af_up,
            af_down,
            pca_fit,
            feature.loc[:, data_index].max(),
            feature.loc[:, data_index].min(),
            feature.loc[:, data_index].mean(),
        ],
    )
    print("done with training take:", time.time() - start, "seconds")
Example #4
def clustering_affinity_propagation(data_res):
    """
    Executes sklearn's affinity propagation function with the given data frame
    """
    af = AffinityPropagation()
    af.fit(data_res)

    predictions = af.predict(data_res)
    cluster_centers = af.cluster_centers_

    return predictions, cluster_centers, af
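
# Minimal usage sketch; df_features stands in for any numeric 2-D array.
import numpy as np

df_features = np.random.rand(50, 3)  # placeholder feature matrix
predictions, cluster_centers, af_model = clustering_affinity_propagation(df_features)
print(len(cluster_centers), "clusters")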
Example #5
med_def_distance(player_list)

# make the features array

features_array = np.column_stack(
    (dribbles_list, def_dist_list, shot_dist_list))

norm_features = normalize(features_array, axis=0)

# create the model

model = AffinityPropagation()

model.fit(norm_features)

labels = model.predict(norm_features)

# check to make sure every player has a cluster
# combine players with their clusters
print(player_list.shape)
print(labels.shape)

clusters = np.array(list(zip(player_list, labels)))

sorted_clusters = clusters[np.argsort(clusters[:, 1])]
# print the full list of players and their respective clusters

print(sorted_clusters)
mbk = MiniBatchKMeans(n_clusters=35)
mbk.fit(y_toPredict)
df['Cluster_From_mbk'] = mbk.predict(y_toPredict)

#AgglomerativeClustering
from sklearn.cluster import AgglomerativeClustering

ac = AgglomerativeClustering(n_clusters=35)
df['Cluster_From_ac'] = ac.fit_predict(y_toPredict)

#Affinity Propagation
from sklearn.cluster import AffinityPropagation

ap = AffinityPropagation(preference=-35)
ap.fit(y_toPredict)
df['Cluster_From_ap'] = ap.predict(y_toPredict)

#Creating lists from different clusters from each clustering algorithm

nameListKmeans = [[] for _ in range(num_cluster)]
for index, row in df.iterrows():
    nameListKmeans[row['Cluster_From_km']].append(row['Job_Title'] + "," +
                                                  row['CompanyName'])

nameListBirch = [[] for _ in range(num_cluster)]
for index, row in df.iterrows():
    nameListBirch[row['Cluster_From_birch']].append(row['Job_Title'] + "," +
                                                    row['CompanyName'])
Example #7
def test_affinity_propagation_predict():
    # Test AffinityPropagation.predict
    af = AffinityPropagation(affinity="euclidean")
    labels = af.fit_predict(X)
    labels2 = af.predict(X)
    assert_array_equal(labels, labels2)
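
# A companion sketch (not part of the original test) showing that predict
# also labels points unseen during fit, assigning each to its nearest
# exemplar. random_state requires scikit-learn >= 0.23.
import numpy as np
from sklearn.cluster import AffinityPropagation

rng = np.random.RandomState(0)
X_fit = rng.rand(60, 2)
X_new = rng.rand(5, 2)  # points the model never saw
af_demo = AffinityPropagation(random_state=0).fit(X_fit)
print(af_demo.predict(X_new))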
Example #8


print("Calculate AP codebook for quantization  ...")
af = AffinityPropagation().fit(resnet50_train_overall)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
AP_codebook_size = len(cluster_centers_indices) 

bovw_matrix_train=np.zeros((raw_matrix_train.shape[0],raw_matrix_train.shape[1]))

for i in xrange(bovw_matrix_train.shape[0]):
    for j in xrange(bovw_matrix_train.shape[1]):
        current_frame_rawfeature=raw_matrix_train[i,j]
        current_frame_w=kmeans_codebook.predict(current_frame_rawfeature.reshape(1,-1))[0]
        current_frame_w=af.predict(current_frame_rawfeature.reshape(1,-1))[0]
        bovw_matrix_train[i,j]=int(current_frame_w)


bovw_matrix_test=np.zeros((raw_matrix_test.shape[0],raw_matrix_test.shape[1]))

for i in xrange(bovw_matrix_test.shape[0]):
    for j in xrange(bovw_matrix_test.shape[1]):
        current_frame_rawfeature=raw_matrix_test[i,j]
        current_frame_w=kmeans_codebook.predict(current_frame_rawfeature.reshape(1,-1))[0]
        current_frame_w=af.predict(current_frame_rawfeature.reshape(1,-1))[0]
        bovw_matrix_test[i,j]=int(current_frame_w)

 
 
 
Example #9
# (truncated: the literal definition of the feature matrix X was cut off here)

y = [
    'INTJ', 'Jesus', 'ice', 'ENFP', 'Reagan', 'INTP', 'Badass', 'Charlie',
    'Dennis', 'Mac', 'Cherry', 'ISTJ', 'God', 'Guns', 'Vanilla', 'Fire',
    'seventeen', 'Frank', 'Dee', 'Snail'
]

#print(X)

af = AffinityPropagation(preference=0).fit(X)
print(af.predict(X))
labels = af.labels_
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, labels))

kmeans = KMeans(n_clusters=5, init='k-means++')
kmeans.fit(X, y)
print(kmeans.predict(X))

print(y)

print(kmeans.score(X))

#fpc = fuzz.cluster.cmeans( X, 3, 2, error=0.005, maxiter=1000, init=None)

# Store fpc values for later
#print(fpc)
Example #10
def doAffinity(X):
    model = AffinityPropagation(damping=0.5, max_iter=2, affinity='euclidean')
    model.fit(X)
    clust_labels2 = model.predict(X)
    cent2 = model.cluster_centers_
    return (clust_labels2, cent2)
Example #11
# affinity propagation clustering
from numpy import unique
from numpy import where
from sklearn.datasets import make_classification
from sklearn.cluster import AffinityPropagation
from matplotlib import pyplot
# define dataset
X, _ = make_classification(n_samples=1000,
                           n_features=2,
                           n_informative=2,
                           n_redundant=0,
                           n_clusters_per_class=1,
                           random_state=4)
# define the model
model = AffinityPropagation(damping=0.9)
# fit the model
model.fit(X)
# assign a cluster to each example
yhat = model.predict(X)
# retrieve unique clusters
clusters = unique(yhat)
# create scatter plot for samples from each cluster
for cluster in clusters:
    # get row indexes for samples with this cluster
    row_ix = where(yhat == cluster)
    # create scatter of these samples
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
# show the plot
pyplot.show()
import sys
from time import time

import numpy as np
from PIL import Image
from sklearn.cluster import AffinityPropagation
from sklearn.utils import shuffle

# img_in = Image.open('test.jpg')
img_in = Image.open(sys.argv[1])

img_in = np.array(img_in, dtype=np.float64) / 255

w, h, d = original_shape = tuple(img_in.shape)
assert d == 3
image_array = np.reshape(img_in, (w * h, d))

print("Fitting model on a small sub-sample of the data")
image_array_sample = shuffle(image_array, random_state=0)[:500]
print("Predicting color indices on the full image (affinity propagation)")
t0 = time()
af_prop = AffinityPropagation(max_iter=200, damping=0.9,
                              convergence_iter=50).fit(image_array_sample)
labels_af = af_prop.predict(image_array)
print("done in %0.3fs." % (time() - t0))


def recreate_image(codebook, labels, w, h):
    """Recreate the (compressed) image from the code book & labels"""
    d = codebook.shape[1]
    image = np.zeros((w, h, d))
    label_idx = 0
    for i in range(w):
        for j in range(h):
            image[i][j] = codebook[labels[label_idx]]
            label_idx += 1
    return image
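
# Putting the pieces together: the quantized image can be rebuilt from the
# AP exemplars and the per-pixel labels computed above.
import matplotlib.pyplot as plt

img_out = recreate_image(af_prop.cluster_centers_, labels_af, w, h)
plt.imshow(img_out)
plt.show()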

Example #13
            no need to specify the number of clusters in advance
        Drawbacks:
            high complexity, O(N^2 * T), where T is the number of iterations and N the number of samples

        suitable for small and medium-sized datasets
'''

# Create the estimator, fit it, predict, and inspect the resulting centers
cluster = AffinityPropagation(damping=0.5,
                              max_iter=500,
                              convergence_iter=20,
                              copy=True,
                              preference=-50,
                              affinity='euclidean',
                              verbose=False)
cluster.fit(X)
Y_pred = cluster.predict(X)
cluster.cluster_centers_
cluster.cluster_centers_indices_  # indices of the cluster centers
# plot
plt.subplot(221)
plt.scatter(X[:, 0], X[:, 1], c=Y_pred)
'''
    damping                     damping factor, in [0.5, 1]
    max_iter                    maximum number of iterations
    convergence_iter            number of iterations with no change in the estimated number of clusters that stops convergence
    copy                        whether to copy the input data
    preference                  very important: the per-point preference
    affinity                    how affinity is computed, e.g. Euclidean distance
    verbose                     whether to print progress messages
'''
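
# The O(N^2 * T) cost quoted above is easy to observe; a rough timing
# sketch (absolute numbers will vary by machine):
import time
import numpy as np
from sklearn.cluster import AffinityPropagation

for n in (200, 400, 800):  # doubling N roughly quadruples the pairwise work
    data = np.random.rand(n, 2)
    t0 = time.time()
    AffinityPropagation(damping=0.9).fit(data)
    print(n, "samples:", round(time.time() - t0, 2), "s")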
Example #14
    mini_batch_valid_performance_metrics_for_plotting[item + 1] = mini_batch_valid_performance_metric_array[item]
    mini_batch_test_performance_metrics_for_plotting[item + 1] = mini_batch_test_performance_metric_array[item]
Figures.save_valid_test_performance_measures_vs_hyper_parameters_figure(mini_batch_parameter_search_space_for_plotting,
                                                                        mini_batch_valid_performance_metrics_for_plotting,
                                                                        mini_batch_test_performance_metrics_for_plotting,
                                                                        'Adjusted Mutual Information Score',
                                                                        'MiniBatch K-Means Clustering n_init parameter',
                                                                        'Mini_Batch_k-Means_Performance',
                                                                        0,
                                                                        0.5)

# Do AffinityPropagation, optimizing damping over a validation set
current_optimal_affinity_propagation_parameter = 0.5
initial_optimal_affinity_propagation_clusterer = AffinityPropagation(damping=current_optimal_affinity_propagation_parameter)
initial_optimal_affinity_propagation_clusterer.fit(train_data_set)
initial_affinity_propagation_valid_predictions = initial_optimal_affinity_propagation_clusterer.predict(valid_data_set)
initial_affinity_propagation_test_predictions = initial_optimal_affinity_propagation_clusterer.predict(test_data_set)

# Add one to the predictions to make them match up with range of labels, then apply Hungarian Fix
for element in range(number_of_valid_observations):
    initial_affinity_propagation_valid_predictions[element] += 1
for element in range(number_of_test_observations):
    initial_affinity_propagation_test_predictions[element] += 1
initial_affinity_propagation_valid_predictions = Clustering.Hungarian_Fix(initial_affinity_propagation_valid_predictions,
                                                                          valid_labels).astype('int')
initial_affinity_propagation_test_predictions = Clustering.Hungarian_Fix(initial_affinity_propagation_test_predictions,
                                                                         test_labels).astype('int')

# Set a starting point for optimality of the initial performance metric, to be possibly adjusted later
affinity_propagation_parameter_integer_search_space_start = current_optimal_affinity_propagation_parameter + 0.05
affinity_propagation_parameter_integer_search_space_stop = current_optimal_affinity_propagation_parameter + 0.45
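
# Clustering.Hungarian_Fix is a project-specific helper; below is a minimal
# sketch of the Hungarian-style relabeling it presumably performs, using
# scipy's assignment solver (an assumption, not the project's actual code).
import numpy as np
from scipy.optimize import linear_sum_assignment

def hungarian_relabel(pred, truth):
    """Map predicted cluster ids onto ground-truth labels by maximizing overlap."""
    clusters, labels = np.unique(pred), np.unique(truth)
    # contingency[i, j] = co-occurrence count of cluster i and label j
    contingency = np.array([[np.sum((pred == c) & (truth == l)) for l in labels]
                            for c in clusters])
    rows, cols = linear_sum_assignment(-contingency)  # negate to maximize
    mapping = {clusters[r]: labels[c] for r, c in zip(rows, cols)}
    return np.array([mapping.get(p, p) for p in pred])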
                                  lowercase=True, stop_words="english")


print("Vectorizing...")

t0 = time()
samples = dataset.data[:args.n_samples]
counts = vectorizer.fit_transform(samples)
tfidf = text.TfidfTransformer(norm="l2", use_idf=True).fit_transform(counts)
print("done in %0.3fs." % (time() - t0))

# Fit the model
print("Fitting the model on with n_samples=%d and n_features=%d..."
      % (args.n_samples, args.n_features))

t0 = time()
d = Decomposition()
nmf = d.fit(tfidf)
print("done in %0.3fs." % (time() - t0))


# Fit the model
print("Predicting labels...")

t0 = time()
labels = d.predict(tfidf)
print("done in %0.3fs." % (time() - t0))

for sample, label in zip(samples, labels):
    print(sample, label)
clustering = AffinityPropagation().fit(df)

# In[14]:

AffinityPropagation(affinity='euclidean',
                    convergence_iter=15,
                    copy=True,
                    damping=0.5,
                    max_iter=200,
                    preference=None,
                    verbose=False)

# In[15]:

y_pred = clustering.predict(df)

# In[16]:

plt.scatter(df[" gdp_for_year ($) "], df["suicides_no"], c=y_pred)
plt.title("Clusters")
plt.xlabel("PIB")
plt.ylabel("Número de suicídios")

# In[29]:

# Clustering using the agglomerative algorithm

# In[18]:

preds = []
Example #17
model8 = MeanShift()  # assumed definition; the top of this snippet is truncated
model8.fit(x)
print('\nMeanShift:')
print(model8.cluster_centers_)
ypred8 = model8.predict(x)
plt.figure(figsize=(12, 8))
plt.scatter(x[:, 0], x[:, 1], c=ypred8, cmap='jet')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('MeanShift', fontdict=dict(size=20, color='r'))

model9 = SpectralClustering(n_clusters=4, eigen_solver=None, random_state=1, gamma=1, affinity='rbf',
                            n_neighbors=10)
ypred9 = model9.fit_predict(x)
plt.figure(figsize=(12, 8))
plt.scatter(x[:, 0], x[:, 1], c=ypred9, cmap='rainbow')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('SpectralClustering', fontdict=dict(size=20, color='r'))

model10 = AffinityPropagation(damping=.8)
model10.fit(x)
print('\nAffinityPropagation:')
print(model10.cluster_centers_)
ypred10 = model10.predict(x)
plt.figure(figsize=(12, 8))
plt.scatter(x[:, 0], x[:, 1], c=ypred10, cmap='brg')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('AffinityPropagation', fontdict=dict(size=20, color='r'))

plt.show()
Example #18
file = open('./data/glass.data')
next(file)
X = []
y = []
for line in file.readlines():
    curLine = line.strip().split(", ")
    X.append([float(i) for i in curLine[0:-1]])
    y.append(curLine[-1].strip('.'))

# iterate over classifiers-------------------------------------------
glass_score = []
params = range(-90, 0, 5)
for param in params:
    algorithm = AffinityPropagation(preference=param)
    algorithm.fit(X)
    y_pred = algorithm.predict(X)
    s = adjusted_rand_score(y, y_pred)
    glass_score.append(s)
print('glass_score', glass_score)

# draw score pic---------------------------------------
plt.figure(figsize=(6, 4), dpi=120)
plt.grid()
plt.xlabel('preference for AP')
plt.xticks(params)
plt.plot(params, glass_score, label='glass_score', color='g')
plt.legend()
plt.title("glass AP score")

plt.savefig("img/AP.png")
Example #19
# assumed: the truncated top of this snippet defined the affinity model, e.g.
affinity_model = AffinityPropagation(damping=0.9)
agglomerative_model = AgglomerativeClustering(n_clusters=2)
birch_model = Birch(threshold=0.03, n_clusters=2)
dbscan_model = DBSCAN(eps=0.25, min_samples=9)
kmeans_model = KMeans(n_clusters=2)
mean_model = MeanShift()
optics_model = OPTICS(eps=0.75, min_samples=10)
gaussian_model = GaussianMixture(n_components=2)

# train the model
affinity_model.fit(training_data)
birch_model.fit(training_data)
kmeans_model.fit(training_data)
gaussian_model.fit(training_data)

# assign each data point to a cluster
affinity_result = affinity_model.predict(training_data)
agglomerative_result = agglomerative_model.fit_predict(training_data)
birch_result = birch_model.predict(training_data)
dbscan_result = dbscan_model.fit_predict(training_data)
kmeans_result = kmeans_model.predict(training_data)
mean_result = mean_model.fit_predict(training_data)
optics_result = optics_model.fit_predict(training_data)
gaussian_result = gaussian_model.predict(training_data)

# get all of the unique clusters
affinity_clusters = unique(affinity_result)
agglomerative_clusters = unique(agglomerative_result)
birch_clusters = unique(birch_result)
dbscan_clusters = unique(dbscan_result)
kmeans_clusters = unique(kmeans_result)
mean_clusters = unique(mean_result)
Example #20
def clusterPlot(data):
    if (NORMALIZE):
        data = preprocessing.normalize(data, norm=NORM)
    # print(vec.get_feature_names())
    #reduced_data = PCA(n_components=2, whiten=True).fit_transform(data)
    reduced_data = umap.UMAP(n_neighbors=15).fit_transform(data)
    print(reduced_data)
    clustering = AffinityPropagation(random_state=0).fit(reduced_data)
    #clustering = KMeans(init="k-means++", n_clusters=4, n_init=2)
    #clustering.fit(reduced_data)
    #clustering = DBSCAN(eps=0.3, min_samples=10).fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = 0.001  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    x_min, x_max = reduced_data[:, 0].min() - 0.2, reduced_data[:, 0].max() + 0.2
    y_min, y_max = reduced_data[:, 1].min() - 0.2, reduced_data[:, 1].max() + 0.2
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = clustering.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure(figsize=[30, 20])
    plt.clf()
    plt.imshow(
        Z,
        interpolation="nearest",
        extent=(xx.min(), xx.max(), yy.min(), yy.max()),
        cmap=plt.cm.Paired,
        aspect="auto",
        origin="lower",
    )

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], "k.", markersize=5)
    # Plot the centroids as a white X
    # centroids = kmeans.cluster_centers_
    # plt.scatter(
    #    centroids[:, 0],
    #    centroids[:, 1],
    #    marker="x",
    #    s=169,
    #    linewidths=3,
    #    color="w",
    #    zorder=10,
    # )
    # plt.title("Clustering on the HITO software products (PCA-reduced data)"#"Centroids are marked with white cross")
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())

    # cursor = mplcursors.cursor(hover=True)
    # cursor.connect("add", lambda sel: sel.annotation.set_text(D[sel.target.index]["uri"]))

    # ax = plt.figure().add_subplot(111,autoscale_on=True)
    texts = []
    for i in range(len(D)):
        a = plt.text(reduced_data[i][0], reduced_data[i][1], E[i]["label"])
        texts.append(a)

    if ADJUST_TEXT:
        adjust_text(texts, lim=10)

    plt.tight_layout()
    #plt.savefig("cluster-"+("classifiedonly-" if CLASSIFIED_ONLY else "")+NORM+".pdf", pad_inches=0)
    plt.savefig("cluster-bagofwords-" +
                ("classifiedonly-" if CLASSIFIED_ONLY else "") + NORM + ".pdf",
                pad_inches=0)
    plt.savefig("cluster.png", pad_inches=0)
    plt.show()
Example #21
from sklearn.datasets import make_moons, make_circles, make_blobs
from sklearn.cluster import AffinityPropagation
import numpy as np

centers = [[0, 1], [-1, -1], [1, -1]]
X, y = make_blobs(n_samples=1500, random_state=170)
trs = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X = np.dot(X, trs)
clt = AffinityPropagation(damping=.9)
clt.fit(X)

import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
# set the plot style
mpl.style.use('fivethirtyeight')
# define the x-y mesh grid used for the contour plot
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
# predict a label for every mesh point
Z = clt.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
yp = clt.predict(X)
plt.contourf(xx, yy, Z, alpha=.8)
# draw the scatter plot
plt.scatter(X[:, 0], X[:, 1], c=yp, edgecolors='k')
plt.axis("equal")
plt.show()
Example #22
data

# In[38]:

from sklearn.cluster import AffinityPropagation
clustering = AffinityPropagation().fit(data)
print(clustering)

# In[ ]:

# In[ ]:

# In[42]:

re = clustering.predict(data)

# In[26]:

le = len(clustering.cluster_centers_)
cluster_centers_indices = clustering.cluster_centers_indices_
labels = clustering.labels_
n_clusters_ = len(cluster_centers_indices)
print(n_clusters_)

# In[27]:

from sklearn.cluster import KMeans

km = KMeans(n_clusters=le, random_state=1)
new = data._get_numeric_data()
Example #23
# Imports assumed by this snippet; Enrichment_clustering, Performance_MC and
# the Clustering helpers are project-specific modules.
import argparse
import sys

import joblib
import numpy as np
import pandas as pd
import sklearn.cluster
from sklearn.cluster import AffinityPropagation, Birch, MeanShift


def main():
    parser = argparse.ArgumentParser(
        description='This code contains the clustering model building. ')
    # Required
    req_group = parser.add_argument_group(title='REQUIRED INPUT')
    req_group.add_argument(
        '-df_short_name',
        help=
        'feature matrix, for Set B, use the short name, for Set A, use the full name of the expression matrix',
        required=True)
    req_group.add_argument('-path',
                           help='path to the feature matrix',
                           required=True)
    req_group.add_argument('-save_path',
                           help='path to save the outputs',
                           required=True)
    req_group.add_argument('-clustering_method',
                           help='kmean, affinity, birch, or meanshift',
                           required=True)
    req_group.add_argument('-test_gene_list',
                           help='Genes_for_testing.txt',
                           required=True)
    req_group.add_argument('-train_gene_list',
                           help='Genes_for_training.txt',
                           required=True)
    req_group.add_argument('-dataset', help='setA or setB', required=True)

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)
    args = parser.parse_args()

    DF = args.df_short_name
    path = args.path
    save_path = args.save_path
    clustering_method = args.clustering_method
    TEST = args.test_gene_list
    TRAIN = args.train_gene_list
    dataset = args.dataset

    with open(TEST) as test_file:
        test = test_file.read().splitlines()

    with open(TRAIN) as training_file:
        training = training_file.read().splitlines()

    if dataset == 'setB':
        df = pd.read_csv(path + DF + '_CV_1_features.txt',
                         sep='\t',
                         index_col=0)
        short_name = DF
    if dataset == 'setA':
        expression = pd.read_csv(path + DF, sep='\t', index_col=0)
        pathway_annotation = pd.read_csv(
            'Sly_pathway_annotation_20190117_with_expression_5_members_nonoverlapping.txt',
            sep='\t',
            index_col=1,
            header=None)
        pathway_annotation.columns = ['Class']
        df = pd.concat([pathway_annotation, expression], axis=1)
        short_name = open(
            '/mnt/home/peipeiw/Documents/Pathway_prediction/20180827_all_EC_pathway/Short_name_for_expression_data.txt',
            'r').readlines()
        D = {}
        for inl in short_name:
            D[inl.split('\t')[0]] = inl.split('\t')[1].strip()
        short_name = D[DF]
    y = df['Class']
    classes = y.unique()
    df_test = df[df.index.isin(test)]
    y_test = df_test['Class']
    X_test = df_test.drop(['Class'], axis=1)
    df_training = df[df.index.isin(training)]
    y_training = df_training['Class']
    X_training = df_training.drop(['Class'], axis=1)
    test_classes = y_test.unique()
    if clustering_method.lower() == 'kmean':
        for n_clusters in [5, 10, 25, 50, 85, 100, 200, 300, 400, 500]:
            accuracies = []
            accuracies_ho = []
            f1_array = np.array(
                [np.insert(arr=classes.astype(str), obj=0, values='M')])
            f1_array_ho = np.array([
                np.insert(arr=test_classes.astype(str), obj=0, values='M')
            ])
            for cv_number in range(1, 6):
                if dataset == 'setB':
                    df = pd.read_csv(path + DF +
                                     '_CV_%s_features.txt' % cv_number,
                                     sep='\t',
                                     index_col=0)
                with open('Genes_for_5_training_set%s.txt' %
                          cv_number) as train_file:
                    train = train_file.read().splitlines()
                with open('Genes_for_5_validation_set%s.txt' %
                          cv_number) as validation_file:
                    validation = validation_file.read().splitlines()
                df_train = df[df.index.isin(train)]
                df_validation = df[df.index.isin(validation)]
                X_train = df_train.drop(['Class'], axis=1)
                X_validation = df_validation.drop(['Class'], axis=1)
                y_train = df_train['Class']
                y_validation = df_validation['Class']
                mat = X_train.to_numpy()  # Convert DataFrame to matrix
                mat_validation = X_validation.to_numpy()
                mat_test = X_test.to_numpy()
                clu = sklearn.cluster.KMeans(n_clusters=n_clusters,
                                             n_init=3,
                                             max_iter=500)  # Using sklearn
                clu.fit(mat)
                train_labels = clu.labels_  # Get cluster assignment labels
                train_tem = pd.DataFrame([train_labels
                                          ]).T  # Format results as a DataFrame
                train_tem.index = X_train.index
                train_tem.columns = ['Cluster']
                train_res = pd.concat([y_train, train_tem], axis=1)
                E_C_P = Enrichment_clustering(train_res, n_clusters)

                joblib.dump(
                    clu, save_path + short_name + "_Kmeans_%s_%s_%s.pkl" %
                    (dataset, cv_number, n_clusters))

                cv_labels = clu.predict(mat_validation)
                cv_tem = pd.DataFrame([cv_labels]).T
                cv_tem.index = X_validation.index
                cv_tem.columns = ['Cluster']
                cv_res = pd.concat([y_validation, cv_tem], axis=1)
                for i in range(0, cv_res.shape[0]):
                    try:
                        cv_res.iloc[i, 1] = E_C_P[cv_res.iloc[i, 1]]
                    except KeyError:
                        cv_res.iloc[i, 1] = '%s' % cv_res.iloc[i, 1]
                        print('%s was not enriched for any pathway' %
                              cv_res.iloc[i, 1])
                if cv_number == 1:
                    predicted = cv_res.copy()
                else:
                    predicted = pd.concat([predicted, cv_res], axis=0)
                result = Performance_MC(cv_res.Class, cv_res.Cluster, classes)
                if 'accuracy' in result:
                    accuracies.append(result['accuracy'])
                if 'macro_f1' in result:
                    f1_temp_array = np.insert(arr=result['f1_MC'],
                                              obj=0,
                                              values=result['macro_f1'])
                    f1_array = np.append(f1_array, [f1_temp_array], axis=0)

                test_labels = clu.predict(mat_test)
                test_tem = pd.DataFrame([test_labels]).T
                test_tem.index = X_test.index
                test_tem.columns = ['Cluster']
                test_res = pd.concat([y_test, test_tem], axis=1)
                for i in range(0, test_res.shape[0]):
                    try:
                        test_res.iloc[i, 1] = E_C_P[test_res.iloc[i, 1]]
                    except KeyError:
                        test_res.iloc[i, 1] = '%s' % test_res.iloc[i, 1]
                        print('%s was not enriched for any pathway' %
                              test_res.iloc[i, 1])
                if cv_number == 1:
                    predicted_test = test_res.copy()
                else:
                    predicted_test = pd.concat(
                        [predicted_test, test_res.Cluster], axis=1)
                ho_result = Performance_MC(test_res.Class, test_res.Cluster,
                                           test_classes)
                if 'accuracy' in ho_result:
                    accuracies_ho.append(ho_result['accuracy'])
                if 'macro_f1' in ho_result:
                    ho_f1_temp_array = np.insert(arr=ho_result['f1_MC'],
                                                 obj=0,
                                                 values=ho_result['macro_f1'])
                    f1_array_ho = np.append(f1_array_ho, [ho_f1_temp_array],
                                            axis=0)

            f1 = pd.DataFrame(f1_array)
            f1.columns = f1.iloc[0]
            f1 = f1[1:]
            f1.columns = [str(col) + '_F1' for col in f1.columns]
            f1 = f1.astype(float)

            # Calculate accuracy and f1 stats
            AC = np.mean(accuracies)
            AC_std = np.std(accuracies)
            MacF1 = f1['M_F1'].mean()
            MacF1_std = f1['M_F1'].std()

            print('Save the predicted values:')
            predicted.to_csv(save_path + short_name +
                             "_Kmean_%s_%s_validation_prediction.txt" %
                             (dataset, n_clusters),
                             index=True,
                             header=True,
                             sep="\t")
            predicted_test.to_csv(save_path + short_name +
                                  "_Kmean_%s_%s_test_prediction.txt" %
                                  (dataset, n_clusters),
                                  index=True,
                                  header=True,
                                  sep="\t")

            print(
                "\nCluster results for cross validation: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n"
                % (AC, AC_std, MacF1, MacF1_std))

            # Unpack results for test
            f1_ho = pd.DataFrame(f1_array_ho)
            f1_ho.columns = f1_ho.iloc[0]
            f1_ho = f1_ho[1:]
            f1_ho.columns = [str(col) + '_F1' for col in f1_ho.columns]
            f1_ho = f1_ho.astype(float)
            AC_ho = np.mean(accuracies_ho)
            AC_std_ho = np.std(accuracies_ho)
            MacF1_ho = f1_ho['M_F1'].mean()
            MacF1_std_ho = f1_ho['M_F1'].std()
            print(
                "\nCluster results for test: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n"
                % (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho))

            # Save detailed results file
            n_features = df.shape[1] - 1
            out = open(
                save_path + short_name + "_Kmean_%s_%s_results.txt" %
                (dataset, n_clusters), 'w')

            out.write('\n\nResults for prediction on validation set:\n')
            out.write(
                'Metric\tMean\tSD\nAccuracy\t%05f\t%05f\nF1_macro\t%05f\t%05f\n'
                % (AC, AC_std, MacF1, MacF1_std))
            for cla in f1.columns:
                if 'M_F1' not in cla:
                    out.write('%s\t%05f\t%05f\n' %
                              (cla, np.mean(f1[cla]), np.std(f1[cla])))

            # Add results for test
            out.write('\n\nResults for the test set:\n')
            out.write(
                'HO Accuracy\t%05f +/-%05f\nHO F1_macro\t%05f +/-%05f\n' %
                (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho))
            for cla in f1_ho.columns:
                if 'M_F1' not in cla:
                    out.write('%s\t%05f\t%05f\n' %
                              (cla, np.mean(f1_ho[cla]), np.std(f1_ho[cla])))

            out.close()

    if clustering_method.lower() == 'affinity':
        for damping in [0.5, 0.6, 0.7, 0.8, 0.9, 0.99]:
            accuracies = []
            accuracies_ho = []
            f1_array = np.array(
                [np.insert(arr=classes.astype(str), obj=0, values='M')])
            f1_array_ho = np.array([
                np.insert(arr=test_classes.astype(str), obj=0, values='M')
            ])
            for cv_number in range(1, 6):
                if dataset == 'setB':
                    df = pd.read_csv(path + DF +
                                     '_CV_%s_features.txt' % cv_number,
                                     sep='\t',
                                     index_col=0)
                with open('Genes_for_5_training_set%s.txt' %
                          cv_number) as train_file:
                    train = train_file.read().splitlines()
                with open('Genes_for_5_validation_set%s.txt' %
                          cv_number) as validation_file:
                    validation = validation_file.read().splitlines()
                df_train = df[df.index.isin(train)]
                df_validation = df[df.index.isin(validation)]
                X_train = df_train.drop(['Class'], axis=1)
                X_validation = df_validation.drop(['Class'], axis=1)
                y_train = df_train['Class']
                y_validation = df_validation['Class']
                mat = X_train.to_numpy()  # Convert DataFrame to matrix
                mat_validation = X_validation.to_numpy()
                mat_test = X_test.to_numpy()
                clu = AffinityPropagation(damping=damping)
                clu.fit(mat)
                train_labels = clu.labels_  # Get cluster assignment labels
                n_clusters = len(np.unique(train_labels))
                train_tem = pd.DataFrame([train_labels
                                          ]).T  # Format results as a DataFrame
                train_tem.index = X_train.index
                train_tem.columns = ['Cluster']
                train_res = pd.concat([y_train, train_tem], axis=1)
                E_C_P = Enrichment_clustering(train_res, n_clusters)
                joblib.dump(
                    clu, save_path + short_name +
                    "_AffinityPropagation_%s_%s_%s.pkl" %
                    (dataset, cv_number, damping))

                cv_labels = clu.predict(mat_validation)
                cv_tem = pd.DataFrame([cv_labels]).T
                cv_tem.index = X_validation.index
                cv_tem.columns = ['Cluster']
                cv_res = pd.concat([y_validation, cv_tem], axis=1)
                for i in range(0, cv_res.shape[0]):
                    try:
                        cv_res.iloc[i, 1] = E_C_P[cv_res.iloc[i, 1]]
                    except KeyError:
                        cv_res.iloc[i, 1] = '%s' % cv_res.iloc[i, 1]
                        print('%s was not enriched for any pathway' %
                              cv_res.iloc[i, 1])
                if cv_number == 1:
                    predicted = cv_res.copy()
                else:
                    predicted = pd.concat([predicted, cv_res], axis=0)
                result = Performance_MC(cv_res.Class, cv_res.Cluster, classes)
                if 'accuracy' in result:
                    accuracies.append(result['accuracy'])
                if 'macro_f1' in result:
                    f1_temp_array = np.insert(arr=result['f1_MC'],
                                              obj=0,
                                              values=result['macro_f1'])
                    f1_array = np.append(f1_array, [f1_temp_array], axis=0)

                test_labels = clu.predict(mat_test)
                test_tem = pd.DataFrame([test_labels]).T
                test_tem.index = X_test.index
                test_tem.columns = ['Cluster']
                test_res = pd.concat([y_test, test_tem], axis=1)
                for i in range(0, test_res.shape[0]):
                    try:
                        test_res.iloc[i, 1] = E_C_P[test_res.iloc[i, 1]]
                    except KeyError:
                        test_res.iloc[i, 1] = '%s' % test_res.iloc[i, 1]
                        print('%s was not enriched for any pathway' %
                              test_res.iloc[i, 1])
                if cv_number == 1:
                    predicted_test = test_res.copy()
                else:
                    predicted_test = pd.concat(
                        [predicted_test, test_res.Cluster], axis=1)
                ho_result = Performance_MC(test_res.Class, test_res.Cluster,
                                           test_classes)
                if 'accuracy' in ho_result:
                    accuracies_ho.append(ho_result['accuracy'])
                if 'macro_f1' in ho_result:
                    ho_f1_temp_array = np.insert(arr=ho_result['f1_MC'],
                                                 obj=0,
                                                 values=ho_result['macro_f1'])
                    f1_array_ho = np.append(f1_array_ho, [ho_f1_temp_array],
                                            axis=0)

            f1 = pd.DataFrame(f1_array)
            f1.columns = f1.iloc[0]
            f1 = f1[1:]
            f1.columns = [str(col) + '_F1' for col in f1.columns]
            f1 = f1.astype(float)

            # Calculate accuracy and f1 stats
            AC = np.mean(accuracies)
            AC_std = np.std(accuracies)
            MacF1 = f1['M_F1'].mean()
            MacF1_std = f1['M_F1'].std()

            print('Save the predicted values:')
            predicted.to_csv(
                save_path + short_name +
                "_AffinityPropagation_%s_%s_%s_validation_prediction.txt" %
                (dataset, damping, n_clusters),
                index=True,
                header=True,
                sep="\t")
            predicted_test.to_csv(
                save_path + short_name +
                "_AffinityPropagation_%s_%s_%s_test_prediction.txt" %
                (dataset, damping, n_clusters),
                index=True,
                header=True,
                sep="\t")

            print(
                "\nCluster results for cross validation: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n"
                % (AC, AC_std, MacF1, MacF1_std))

            # Unpack results for test
            f1_ho = pd.DataFrame(f1_array_ho)
            f1_ho.columns = f1_ho.iloc[0]
            f1_ho = f1_ho[1:]
            f1_ho.columns = [str(col) + '_F1' for col in f1_ho.columns]
            f1_ho = f1_ho.astype(float)
            AC_ho = np.mean(accuracies_ho)
            AC_std_ho = np.std(accuracies_ho)
            MacF1_ho = f1_ho['M_F1'].mean()
            MacF1_std_ho = f1_ho['M_F1'].std()
            print(
                "\nCluster results for test: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n"
                % (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho))

            # Save detailed results file
            n_features = df.shape[1] - 1
            out = open(
                save_path + short_name +
                "_AffinityPropagation_%s_%s_%s_results.txt" %
                (dataset, damping, n_clusters), 'w')

            out.write('\n\nResults for prediction on validation set:\n')
            out.write(
                'Metric\tMean\tSD\nAccuracy\t%05f\t%05f\nF1_macro\t%05f\t%05f\n'
                % (AC, AC_std, MacF1, MacF1_std))
            for cla in f1.columns:
                if 'M_F1' not in cla:
                    out.write('%s\t%05f\t%05f\n' %
                              (cla, np.mean(f1[cla]), np.std(f1[cla])))

            # Add results for test
            out.write('\n\nResults for test set:\n')
            out.write(
                'HO Accuracy\t%05f +/-%05f\nHO F1_macro\t%05f +/-%05f\n' %
                (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho))
            for cla in f1_ho.columns:
                if 'M_F1' not in cla:
                    out.write('%s\t%05f\t%05f\n' %
                              (cla, np.mean(f1_ho[cla]), np.std(f1_ho[cla])))

            out.close()

    if clustering_method.lower() == 'birch':
        for n_clusters in [5, 10, 25, 50, 85, 100, 200, 300, 400, 500]:
            accuracies = []
            accuracies_ho = []
            f1_array = np.array(
                [np.insert(arr=classes.astype(str), obj=0, values='M')])
            f1_array_ho = np.array([
                np.insert(arr=test_classes.astype(str), obj=0, values='M')
            ])
            for cv_number in range(1, 6):
                if dataset == 'setB':
                    df = pd.read_csv(path + DF +
                                     '_CV_%s_features.txt' % cv_number,
                                     sep='\t',
                                     index_col=0)
                with open('Genes_for_5_training_set%s.txt' %
                          cv_number) as train_file:
                    train = train_file.read().splitlines()
                with open('Genes_for_5_validation_set%s.txt' %
                          cv_number) as validation_file:
                    validation = validation_file.read().splitlines()
                df_train = df[df.index.isin(train)]
                df_validation = df[df.index.isin(validation)]
                X_train = df_train.drop(['Class'], axis=1)
                X_validation = df_validation.drop(['Class'], axis=1)
                y_train = df_train['Class']
                y_validation = df_validation['Class']
                mat = X_train.to_numpy()  # Convert DataFrame to matrix
                mat_validation = X_validation.to_numpy()
                mat_test = X_test.to_numpy()
                clu = Birch(n_clusters=n_clusters)
                clu.fit(mat)
                train_labels = clu.labels_  # Get cluster assignment labels
                n_clusters = len(np.unique(train_labels))
                train_tem = pd.DataFrame([train_labels
                                          ]).T  # Format results as a DataFrame
                train_tem.index = X_train.index
                train_tem.columns = ['Cluster']
                train_res = pd.concat([y_train, train_tem], axis=1)
                E_C_P = Enrichment_clustering(train_res, n_clusters)
                joblib.dump(
                    clu, save_path + short_name + "_Birch_%s_%s_%s.pkl" %
                    (dataset, cv_number, n_clusters))

                cv_labels = clu.predict(mat_validation)
                cv_tem = pd.DataFrame([cv_labels]).T
                cv_tem.index = X_validation.index
                cv_tem.columns = ['Cluster']
                cv_res = pd.concat([y_validation, cv_tem], axis=1)
                for i in range(0, cv_res.shape[0]):
                    try:
                        cv_res.iloc[i, 1] = E_C_P[cv_res.iloc[i, 1]]
                    except KeyError:
                        cv_res.iloc[i, 1] = '%s' % cv_res.iloc[i, 1]
                        print('%s was not enriched for any pathway' %
                              cv_res.iloc[i, 1])
                if cv_number == 1:
                    predicted = cv_res.copy()
                else:
                    predicted = pd.concat([predicted, cv_res], axis=0)
                result = Performance_MC(cv_res.Class, cv_res.Cluster, classes)
                if 'accuracy' in result:
                    accuracies.append(result['accuracy'])
                if 'macro_f1' in result:
                    f1_temp_array = np.insert(arr=result['f1_MC'],
                                              obj=0,
                                              values=result['macro_f1'])
                    f1_array = np.append(f1_array, [f1_temp_array], axis=0)

                test_labels = clu.predict(mat_test)
                test_tem = pd.DataFrame([test_labels]).T
                test_tem.index = X_test.index
                test_tem.columns = ['Cluster']
                test_res = pd.concat([y_test, test_tem], axis=1)
                for i in range(0, test_res.shape[0]):
                    try:
                        test_res.iloc[i, 1] = E_C_P[test_res.iloc[i, 1]]
                    except KeyError:
                        test_res.iloc[i, 1] = '%s' % test_res.iloc[i, 1]
                        print('%s was not enriched for any pathway' %
                              test_res.iloc[i, 1])
                if cv_number == 1:
                    predicted_test = test_res.copy()
                else:
                    predicted_test = pd.concat(
                        [predicted_test, test_res.Cluster], axis=1)
                ho_result = Performance_MC(test_res.Class, test_res.Cluster,
                                           test_classes)
                if 'accuracy' in ho_result:
                    accuracies_ho.append(ho_result['accuracy'])
                if 'macro_f1' in ho_result:
                    ho_f1_temp_array = np.insert(arr=ho_result['f1_MC'],
                                                 obj=0,
                                                 values=ho_result['macro_f1'])
                    f1_array_ho = np.append(f1_array_ho, [ho_f1_temp_array],
                                            axis=0)

            f1 = pd.DataFrame(f1_array)
            f1.columns = f1.iloc[0]
            f1 = f1[1:]
            f1.columns = [str(col) + '_F1' for col in f1.columns]
            f1 = f1.astype(float)

            # Calculate accuracy and f1 stats
            AC = np.mean(accuracies)
            AC_std = np.std(accuracies)
            MacF1 = f1['M_F1'].mean()
            MacF1_std = f1['M_F1'].std()

            print('Save the predicted values:')
            predicted.to_csv(save_path + short_name +
                             "_Birch_%s_%s_validation_prediction.txt" %
                             (dataset, n_clusters),
                             index=True,
                             header=True,
                             sep="\t")
            predicted_test.to_csv(save_path + short_name +
                                  "_Birch_%s_%s_test_prediction.txt" %
                                  (dataset, n_clusters),
                                  index=True,
                                  header=True,
                                  sep="\t")
            print(
                "\nCluster results for cross validation: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n"
                % (AC, AC_std, MacF1, MacF1_std))

            # Unpack results for test
            f1_ho = pd.DataFrame(f1_array_ho)
            f1_ho.columns = f1_ho.iloc[0]
            f1_ho = f1_ho[1:]
            f1_ho.columns = [str(col) + '_F1' for col in f1_ho.columns]
            f1_ho = f1_ho.astype(float)
            AC_ho = np.mean(accuracies_ho)
            AC_std_ho = np.std(accuracies_ho)
            MacF1_ho = f1_ho['M_F1'].mean()
            MacF1_std_ho = f1_ho['M_F1'].std()
            print(
                "\nCluster Results for test: \nAccuracy: %03f (+/- stdev %03f)\nF1 (macro): %03f (+/- stdev %03f)\n"
                % (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho))

            # Save detailed results file
            n_features = df.shape[1] - 1
            out = open(
                save_path + short_name + "_Birch_%s_%s_results.txt" %
                (dataset, n_clusters), 'w')

            out.write('\n\nResults for prediction on validation set:\n')
            out.write(
                'Metric\tMean\tSD\nAccuracy\t%05f\t%05f\nF1_macro\t%05f\t%05f\n'
                % (AC, AC_std, MacF1, MacF1_std))
            for cla in f1.columns:
                if 'M_F1' not in cla:
                    out.write('%s\t%05f\t%05f\n' %
                              (cla, np.mean(f1[cla]), np.std(f1[cla])))

            # Add results for test
            out.write('\n\nResults for test set:\n')
            out.write(
                'HO Accuracy\t%05f +/-%05f\nHO F1_macro\t%05f +/-%05f\n' %
                (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho))
            for cla in f1_ho.columns:
                if 'M_F1' not in cla:
                    out.write('%s\t%05f\t%05f\n' %
                              (cla, np.mean(f1_ho[cla]), np.std(f1_ho[cla])))

            out.close()

    if clustering_method.lower() == 'meanshift':
        for bandwidth in [0.01, 0.05, 0.1, 0.5, 1]:
            accuracies = []
            accuracies_ho = []
            f1_array = np.array(
                [np.insert(arr=classes.astype(str), obj=0, values='M')])
            f1_array_ho = np.array([
                np.insert(arr=test_classes.astype(str), obj=0, values='M')
            ])
            for cv_number in range(1, 6):
                if dataset == 'setB':
                    df = pd.read_csv(path + DF +
                                     '_CV_%s_features.txt' % cv_number,
                                     sep='\t',
                                     index_col=0)
                with open('Genes_for_5_training_set%s.txt' %
                          cv_number) as train_file:
                    train = train_file.read().splitlines()
                with open('Genes_for_5_validation_set%s.txt' %
                          cv_number) as validation_file:
                    validation = validation_file.read().splitlines()
                df_train = df[df.index.isin(train)]
                df_validation = df[df.index.isin(validation)]
                X_train = df_train.drop(['Class'], axis=1)
                X_validation = df_validation.drop(['Class'], axis=1)
                y_train = df_train['Class']
                y_validation = df_validation['Class']
                mat = X_train.to_numpy()  # Convert DataFrame to matrix
                mat_validation = X_validation.to_numpy()
                mat_test = X_test.to_numpy()
                clu = MeanShift(
                    bandwidth=bandwidth, cluster_all=True
                )  # cluster_all=True forces the assignment of all instance. if cluster_all=False, orphans are given cluster label -1
                clu.fit(mat)
                train_labels = clu.labels_  # Get cluster assignment labels
                n_clusters = len(np.unique(train_labels))
                train_tem = pd.DataFrame([train_labels
                                          ]).T  # Format results as a DataFrame
                train_tem.index = X_train.index
                train_tem.columns = ['Cluster']
                train_res = pd.concat([y_train, train_tem], axis=1)
                E_C_P = Enrichment_clustering(train_res, n_clusters)
                joblib.dump(
                    clu, save_path + short_name + "_MeanShift_%s_%s_%s.pkl" %
                    (dataset, cv_number, bandwidth))

                cv_labels = clu.predict(mat_validation)
                cv_tem = pd.DataFrame([cv_labels]).T
                cv_tem.index = X_validation.index
                cv_tem.columns = ['Cluster']
                cv_res = pd.concat([y_validation, cv_tem], axis=1)
                for i in range(0, cv_res.shape[0]):
                    try:
                        cv_res.iloc[i, 1] = E_C_P[cv_res.iloc[i, 1]]
                    except KeyError:
                        cv_res.iloc[i, 1] = '%s' % cv_res.iloc[i, 1]
                        print('%s was not enriched for any pathway' %
                              cv_res.iloc[i, 1])
                if cv_number == 1:
                    predicted = cv_res.copy()
                else:
                    predicted = pd.concat([predicted, cv_res], axis=0)
                result = Performance_MC(cv_res.Class, cv_res.Cluster, classes)
                if 'accuracy' in result:
                    accuracies.append(result['accuracy'])
                if 'macro_f1' in result:
                    f1_temp_array = np.insert(arr=result['f1_MC'],
                                              obj=0,
                                              values=result['macro_f1'])
                    f1_array = np.append(f1_array, [f1_temp_array], axis=0)

                test_labels = clu.predict(mat_test)
                test_tem = pd.DataFrame([test_labels]).T
                test_tem.index = X_test.index
                test_tem.columns = ['Cluster']
                test_res = pd.concat([y_test, test_tem], axis=1)
                for i in range(0, test_res.shape[0]):
                    try:
                        test_res.iloc[i, 1] = E_C_P[test_res.iloc[i, 1]]
                    except KeyError:
                        test_res.iloc[i, 1] = '%s' % test_res.iloc[i, 1]
                        print('%s was not enriched for any pathway' %
                              test_res.iloc[i, 1])
                if cv_number == 1:
                    predicted_test = test_res.copy()
                else:
                    predicted_test = pd.concat(
                        [predicted_test, test_res.Cluster], axis=1)
                ho_result = Performance_MC(test_res.Class, test_res.Cluster,
                                           test_classes)
                if 'accuracy' in ho_result:
                    accuracies_ho.append(ho_result['accuracy'])
                if 'macro_f1' in ho_result:
                    ho_f1_temp_array = np.insert(arr=ho_result['f1_MC'],
                                                 obj=0,
                                                 values=ho_result['macro_f1'])
                    f1_array_ho = np.append(f1_array_ho, [ho_f1_temp_array],
                                            axis=0)

            f1 = pd.DataFrame(f1_array)
            f1.columns = f1.iloc[0]
            f1 = f1[1:]
            f1.columns = [str(col) + '_F1' for col in f1.columns]
            f1 = f1.astype(float)

            # Calculate accuracy and f1 stats
            AC = np.mean(accuracies)
            AC_std = np.std(accuracies)
            MacF1 = f1['M_F1'].mean()
            MacF1_std = f1['M_F1'].std()

            print('Save the predicted values:')
            predicted.to_csv(save_path + short_name +
                             "_MeanShift_%s_%s_validation_prediction.txt" %
                             (dataset, bandwidth),
                             index=True,
                             header=True,
                             sep="\t")
            predicted_test.to_csv(save_path + short_name +
                                  "_MeanShift_%s_%s_test_prediction.txt" %
                                  (dataset, bandwidth),
                                  index=True,
                                  header=True,
                                  sep="\t")
            print(
                "\nCluster results for cross validation: \nAccuracy: %.3f (+/- stdev %.3f)\nF1 (macro): %.3f (+/- stdev %.3f)\n"
                % (AC, AC_std, MacF1, MacF1_std))

            # Unpack results for test
            f1_ho = pd.DataFrame(f1_array_ho)
            f1_ho.columns = f1_ho.iloc[0]
            f1_ho = f1_ho[1:]
            f1_ho.columns = [str(col) + '_F1' for col in f1_ho.columns]
            f1_ho = f1_ho.astype(float)
            AC_ho = np.mean(accuracies_ho)
            AC_std_ho = np.std(accuracies_ho)
            MacF1_ho = f1_ho['M_F1'].mean()
            MacF1_std_ho = f1_ho['M_F1'].std()
            print(
                "\nCluster results for test: \nAccuracy: %.3f (+/- stdev %.3f)\nF1 (macro): %.3f (+/- stdev %.3f)\n"
                % (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho))

            # Save detailed results file
            n_features = df.shape[1] - 1
            out = open(
                save_path + short_name + "_MeanShift_%s_%s_results.txt" %
                (dataset, n_clusters), 'w')

            out.write('\n\nResults for prediction on validation set:\n')
            out.write(
                'Metric\tMean\tSD\nAccuracy\t%.5f\t%.5f\nF1_macro\t%.5f\t%.5f\n'
                % (AC, AC_std, MacF1, MacF1_std))
            for cla in f1.columns:
                if 'M_F1' not in cla:
                    out.write('%s\t%.5f\t%.5f\n' %
                              (cla, np.mean(f1[cla]), np.std(f1[cla])))

            # Add results for test
            out.write('\n\nResults for test set:\n')
            out.write(
                'HO Accuracy\t%.5f +/-%.5f\nHO F1_macro\t%.5f +/-%.5f\n' %
                (AC_ho, AC_std_ho, MacF1_ho, MacF1_std_ho))
            for cla in f1_ho.columns:
                if 'M_F1' not in cla:
                    out.write('%s\t%.5f\t%.5f\n' %
                              (cla, np.mean(f1_ho[cla]), np.std(f1_ho[cla])))

            out.close()
Example #24
def Affinity_Propagation(x, y):
    # Cluster x with affinity propagation and report the normalized mutual
    # information (NMI) between the predicted clusters and the true labels y.
    aff = AffinityPropagation().fit(x)
    pred = aff.predict(x)
    NMI = metrics.normalized_mutual_info_score(y, pred)
    print("Affinity_Propagation:", NMI)
# Agglomerative clustering in sklearn gives us lots of distance/proximity
# metrics to choose from; these are selected via the affinity parameter.

# The linkage parameter has to be 'average' or 'complete' rather than 'ward'
# in order to use non-euclidean distance metrics.
agglo = AgglomerativeClustering(n_clusters=4, affinity='l1', linkage='average')
cAssign = agglo.fit_predict(df)

plt.scatter(df['x'], df['y'], c=cAssign, cmap='tab10')
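# A quick check of the constraint noted above (a sketch, assuming the same
# df): ward linkage only accepts euclidean distances, so pairing it with
# affinity='l1' raises a ValueError at fit time.
try:
    AgglomerativeClustering(n_clusters=4, affinity='l1', linkage='ward').fit(df)
except ValueError as err:
    print(err)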

# In[73]:

#http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AffinityPropagation.html#sklearn.cluster.AffinityPropagation

affProp = AffinityPropagation()
affProp.fit(df)
cAssign = affProp.predict(df)
plt.scatter(df['x'], df['y'], c=cAssign, cmap='tab10')
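# Affinity propagation chooses the number of clusters itself; the fitted
# model exposes the exemplars it selected:
print('clusters found:', len(affProp.cluster_centers_indices_))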

# In[74]:

#documentation: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.SpectralClustering.html#sklearn.cluster.SpectralClustering

from sklearn.cluster import SpectralClustering
spectral = SpectralClustering()
cAssign = spectral.fit_predict(df)
plt.scatter(df['x'], df['y'], c=cAssign, cmap='tab10')
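# Note: SpectralClustering defaults to n_clusters=8. A sketch pinning it to
# the 4 clusters used in the cells above (the value 4 is our assumption):
# cAssign = SpectralClustering(n_clusters=4).fit_predict(df)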

# In[80]:

#http://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html#sklearn.mixture.GaussianMixture
f_list_2.append(sp_centroid)
f_list_2.append(sp_bandwidth)
f_list_2.append(sp_contrast)
f_list_2.append(sp_rolloff)
f_np_2 = np.array(f_list_2)
f_np_2 = np.transpose(f_np_2)

f_np_3 = np.array(mfcc)
f_np_4 = np.array(chroma_stft)

master = np.concatenate([f_np_1, f_np_2, f_np_3, f_np_4], axis=1)  # stack the feature groups column-wise

cluster_obj = AffinityPropagation().fit(master)
#cluster_obj = KMeans(n_clusters = 2 ,random_state=0).fit(master)
#print("Number of clusters : " + str(len(cluster_obj.cluster_centers_indices_)))
res = cluster_obj.predict(master)
#print(cluster_obj.get_params())
s = res[0]
t = 0.0
time = []
speaker = []
time.append(t)
speaker.append(s)
# Each element of res is one 0.2 s frame; advance the clock every frame and
# record a (time, speaker) pair whenever the predicted cluster changes.
for u in range(len(res)):
    t = t + 0.2
    if res[u] != s:
        s = res[u]
        speaker.append(s)
        time.append(t)
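# The parallel lists hold one (start time, cluster) pair per speaker change;
# a sketch printing them as segments (assumes the 0.2 s frame hop above):
# for start, spk in zip(time, speaker):
#     print('t = %.1f s -> cluster %d' % (start, spk))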
Example #27
def ap_cluster_k(x,
                 K,
                 preference_init=-1.0,
                 max_iter=30,
                 c=None,
                 iter_finetune=10):
    '''
    Clustering of x by affinity propagation such that the number of clusters
    is K.

    args:
        x (ndarray):
            Data matrix.
        K (int):
            Target number of clusters.
        preference_init (float):
            Initial (negative) preference used to bracket the bisection
            search.
        max_iter (int):
            Maximum number of trials for the bisection search.
        c (ndarray, optional):
            Class labels of x. If this parameter is specified, the function
            tries to find a better solution (in terms of NMI) by random
            search.
        iter_finetune (int):
            Number of steps for the random search.
    '''

    # first, search rough lower bound of the preference
    assert preference_init < 0, "preference_init must be negative."
    p = float(preference_init)  # preference parameter
    p_upper = 0
    k_upper = x.shape[0]  # at preference ~0, nearly every point is an exemplar
    for i in range(5):
        ap = AffinityPropagation(preference=p).fit(x)
        k_current = len(ap.cluster_centers_indices_)
        if k_current > K:
            p_upper = p
            k_upper = k_current
            p *= 10
        else:
            p_lower = p
            k_lower = k_current
            break
    else:
        raise RuntimeError("Can't find initial lower bound for preference."
                           " Try another value of p_initial.")

    # search the preference by bisection method
    for i in range(max_iter):
        p = (p_lower + p_upper) / 2
        ap = AffinityPropagation(preference=p).fit(x)
        k_current = len(ap.cluster_centers_indices_)
        print('K = {}, k_current = {}, p = {}'.format(K, k_current, p))
        print('{}:{}, {}:{}, {}:{}'.format(k_lower, p_lower, k_current, p,
                                           k_upper, p_upper))

        # if the current k goes out of bounds then retry with perturbed p
        while k_current < k_lower or k_current > k_upper:
            print("retry")
            p += np.random.uniform(p_lower, p_upper) / 10
            ap = AffinityPropagation(preference=p).fit(x)
            k_current = len(ap.cluster_centers_indices_)
            print('K = {}, k_current = {}, p = {}'.format(K, k_current, p))
            print('{}:{}, {}:{}, {}:{}'.format(k_lower, p_lower, k_current, p,
                                               k_upper, p_upper))

        if k_current < K:
            p_lower = p
            k_lower = k_current
        elif k_current > K:
            p_upper = p
            k_upper = k_current
        else:
            break
    else:
        raise RuntimeError("Can't find a preference to form K clusters."
                           " Try another value of p_initial.")

    if c is None:
        return ap

    # Search further better preference in terms of NMI score by random search
    p_best = p
    score_best = normalized_mutual_info_score(c, ap.predict(x))
    print('initial score:', score_best)
    print()
    for i in range(iter_finetune):
        p = np.random.normal(p_best, (p_upper - p_lower) / 2)
        if p < p_lower or p > p_upper:  # p outside the current bracket is rejected
            print('reject')
            continue
        ap = AffinityPropagation(preference=p).fit(x)
        k_current = len(ap.cluster_centers_indices_)
        if k_current < K and p > p_lower:
            p_lower = p
        elif k_current > K and p < p_upper:
            p_upper = p
        else:  # where k_current == K
            score = normalized_mutual_info_score(c, ap.predict(x))
            if score > score_best:
                print("update p {} -> {}".format(p_best, p))
                p_best = p
                score_best = score
        print('p: {}, {}, {}'.format(p_lower, p, p_upper))
        print('score: {}'.format(score_best))
        print()
    return AffinityPropagation(preference=p_best).fit(x)
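# Hypothetical usage of ap_cluster_k (the imports and the make_blobs demo
# data are assumptions; the original shows no caller):
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import make_blobs
from sklearn.metrics import normalized_mutual_info_score

X_demo, c_demo = make_blobs(n_samples=300, centers=4, random_state=0)
ap_demo = ap_cluster_k(X_demo, K=4, preference_init=-1.0, c=c_demo)
print(len(ap_demo.cluster_centers_indices_))  # expected: 4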
Example #28
print(
    metrics.f1_score(y_test,
                     y_predicted,
                     average='macro',
                     zero_division='warn',
                     labels=np.unique(y_predicted)))

print(classification_report(y_test, y_predicted))

# Applying the AffinityPropagation clustering algorithm
print('test_data.shape')
print(test_data.shape)
print('y_test.shape')
print(y_test.shape)
clustering = AffinityPropagation(random_state=5).fit(df)

y_predicted = clustering.predict(test_data)

print('y_predicted shape')
print(y_predicted.shape)
print(y_predicted)

print('y_test shape')
print(y_test.shape)
print(y_test)
print(type(y_test))
path = 'data/' + filename + '.txt'
data = pd.read_csv(path, delimiter='\t')

columns = ['X', 'Y', 'Z']
X = data[columns]

print(data.isnull().sum())  # check for missing values

from sklearn.preprocessing import MinMaxScaler

obj = MinMaxScaler()
X_scaled = obj.fit_transform(X)

from sklearn.cluster import AffinityPropagation

model = AffinityPropagation()
model.fit(X_scaled)
y_pred = model.predict(X_scaled)

clusters = np.unique(y_pred)
data = X.join(pd.DataFrame({'Cluster': y_pred}))
data.to_csv('data/clusters/' + filename + '.csv', index=False)

# clusters = np.unique(y_pred)
# for cluster in clusters:
#     row_idx = np.where(y_pred == cluster)
#     X.loc[row_idx].to_csv('clusters/' + str(cluster) + '_' + filename + '.csv', index=None)
#     plt.scatter(X.loc[row_idx]['X'],X.loc[row_idx]['Y'])

# plt.show()