Example #1
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs = {"metric": "euclidean", "neighborMode": "knn", "weightMode": "heatKernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    num_fea = 100    # specify the number of selected features
    num_cluster = 20    # specify the number of clusters; usually set to the number of classes in the ground truth

    # obtain the feature weight matrix
    Weight = MCFS.mcfs(X, n_selected_features=num_fea, W=W, n_clusters=num_cluster)

    # rank features in descending order of their MCFS scores (most important first)
    idx = MCFS.feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total)/20)
    print('ACC:', float(acc_total)/20)
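Note: every example in this listing funnels into unsupervised_evaluation.evaluation. As a reference, here is a minimal sketch of what that helper plausibly computes: k-means on the selected features, then NMI and best-map accuracy against the ground truth. The exact skfeature implementation may differ (e.g., in k-means initialization).

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score

def evaluation_sketch(X_selected, n_clusters, y):
    # cluster the selected features with k-means
    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(X_selected)
    # NMI between predicted clusters and ground-truth labels
    nmi = normalized_mutual_info_score(y, labels)
    # ACC: best one-to-one matching between clusters and classes (Hungarian method)
    classes = np.unique(y)
    hits = np.zeros((n_clusters, len(classes)))
    for i in range(n_clusters):
        for j, c in enumerate(classes):
            hits[i, j] = np.sum((labels == i) & (y == c))
    row, col = linear_sum_assignment(-hits)  # maximize total matches
    acc = hits[row, col].sum() / len(y)
    return nmi, acc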
Example #2
def test_spec():
    # load data
    mat = scipy.io.loadmat('./data/COIL20.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]

    # perform evaluation on the clustering task
    num_fea = 100  # number of selected features
    num_cluster = 20  # number of clusters; usually set to the number of classes in the ground truth

    kwargs = {'style': 0}
    pipeline = []
    spec_partial = partial(SPEC.spec, **kwargs)
    pipeline.append(
        ('select top k', SelectKBest(score_func=spec_partial, k=num_fea)))
    model = Pipeline(pipeline)

    # pass a dummy y of zeros to demonstrate that this works in an unsupervised setting.
    selected_features = model.fit_transform(X, y=np.zeros(X.shape[0]))

    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / 20)
    print('ACC:', float(acc_total) / 20)
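The partial trick above exists because SelectKBest always invokes its score_func as score_func(X, y). A generic adapter (a hypothetical helper, not part of skfeature) absorbs the unused y:

def unsupervised_score_func(score_func, **kwargs):
    # wrap an unsupervised scorer so it matches SelectKBest's (X, y) contract;
    # the y argument is accepted and ignored
    def scorer(X, y=None):
        return score_func(X, **kwargs)
    return scorer

# e.g. SelectKBest(score_func=unsupervised_score_func(SPEC.spec, style=0), k=num_fea)

Note that SelectKBest keeps the k features with the largest scores, so this pattern assumes a higher-is-better scoring convention; a lower-is-better score would need to be negated first.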
Example #3
def main():
    # load data
    mat = scipy.io.loadmat('../data/BASEHOCK.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    p = 0.1    # specify the threshold p to be 0.1
    num_cluster = 2    # specify the number of clusters to be 2

    # perform feature selection and obtain the dataset on the selected features
    selected_features = low_variance.low_variance_feature_selection(X, p*(1-p))

    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total)/20)
    print('ACC:', float(acc_total)/20)
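The threshold p*(1-p) comes from the variance of a Bernoulli variable: a boolean feature that takes the same value in more than a fraction 1-p of the samples has variance below p*(1-p), so this call drops near-constant features. Assuming X is loaded as above, the same selection can be done with scikit-learn directly:

from sklearn.feature_selection import VarianceThreshold

# keep only features whose variance exceeds p * (1 - p)
selector = VarianceThreshold(threshold=0.1 * (1 - 0.1))
selected_features = selector.fit_transform(X)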
Example #4
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # perform evaluation on the clustering task
    num_fea = 100    # number of selected features
    num_cluster = 20    # number of clusters; usually set to the number of classes in the ground truth

    # obtain the feature weight matrix
    Weight = UDFS.udfs(X, gamma=0.1, n_clusters=num_cluster)

    # rank features in descending order of their scores (most important first)
    idx = feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total)/20)
    print('ACC:', float(acc_total)/20)
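Examples #1, #4, #9, and #12 all rank a learned weight matrix with a feature_ranking helper. For sparse-learning methods such as MCFS, UDFS, and NDFS, this presumably reduces to sorting features by the norm of their weight rows; a minimal sketch (the library may use a slightly different norm):

import numpy as np

def feature_ranking_sketch(Weight):
    # score each feature by the L2 norm of its row in the weight matrix,
    # then rank in descending order (most important first)
    scores = np.linalg.norm(Weight, axis=1)
    return np.argsort(scores)[::-1]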
Example #5
def eval_subset(train, test):
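    # train/test are assumed to be 3-tuples: [0] feature matrix,
    # [1] regression/reconstruction targets, [2] class labels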
    n_clusters = len(np.unique(train[2]))

    clf = ExtraTreesClassifier(n_estimators=50, n_jobs=-1)
    clf.fit(train[0], train[2])
    DTacc = float(clf.score(test[0], test[2]))

    clf = KNeighborsClassifier(n_neighbors=1, algorithm='brute', n_jobs=1)
    clf.fit(train[0], train[2])
    acc = float(clf.score(test[0], test[2]))

    LR = LinearRegression(n_jobs=-1)
    LR.fit(train[0], train[1])
    MSELR = float(((LR.predict(test[0]) - test[1])**2).mean())

    MSE = float((((decoder((train[0], train[1]),
                           (test[0], test[1])) - test[1])**2).mean()))

    max_iters = 10
    cnmi, cacc = 0.0, 0.0
    for _ in range(max_iters):
        # use a distinct name for the clustering accuracy so the 1-NN
        # accuracy computed above is not overwritten
        nmi, cl_acc = unsupervised_evaluation.evaluation(train[0],
                                                         n_clusters=n_clusters,
                                                         y=train[2])
        cnmi += nmi / max_iters
        cacc += cl_acc / max_iters
    print('nmi = {:.3f}, acc = {:.3f}'.format(cnmi, cacc))
    print('acc = {:.3f}, DTacc = {:.3f}, MSELR = {:.3f}, MSE = {:.3f}'.format(
        acc, DTacc, MSELR, MSE))
    return MSELR, MSE, acc, DTacc, float(cnmi), float(cacc)
Example #6
def test_low_variance():
    # load data
    from sklearn.feature_selection import VarianceThreshold
    mat = scipy.io.loadmat('./data/BASEHOCK.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    p = 0.1    # specify the threshold p to be 0.1
    num_cluster = 2    # specify the number of clusters to be 2    
    
    # build pipeline
    pipeline = []
    
    # skfeature's low_variance_feature_selection(X, threshold) is a plain function,
    # not a transformer, so use the equivalent sklearn VarianceThreshold in the pipeline
    pipeline.append(('low_variance', VarianceThreshold(threshold=p*(1-p))))
    model = Pipeline(pipeline)
    # pass a dummy y of zeros to demonstrate that this works in an unsupervised setting;
    # fit_transform performs the selection and returns the dataset on the selected features
    selected_features = model.fit_transform(X, y=np.zeros(X.shape[0]))
        
    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total)/20)
    print('ACC:', float(acc_total)/20)
    assert_true(float(nmi_total)/20 > 0.0)
    assert_true(float(acc_total)/20 > 0.5)
Example #7
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # style 0 selects the second ranking function, which uses all except the first eigenvalue
    kwargs = {'style': 0}

    # obtain the scores of features
    score = SPEC.spec(X, **kwargs)

    # rank features in descending order of their scores
    idx = SPEC.feature_ranking(score, **kwargs)

    # perform evaluation on the clustering task
    num_fea = 100    # number of selected features
    num_cluster = 20    # number of clusters; usually set to the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total)/20)
    print('ACC:', float(acc_total)/20)
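The score, rank, and slice boilerplate above recurs in almost every example. A small helper (hypothetical, not part of skfeature) that bundles it:

def select_top_features(X, score_func, ranking_func, num_fea, **kwargs):
    # score features, rank them, and keep the data on the top num_fea columns
    score = score_func(X, **kwargs)
    idx = ranking_func(score, **kwargs)
    return X[:, idx[:num_fea]]

# e.g. selected_features = select_top_features(X, SPEC.spec, SPEC.feature_ranking, 100, style=0)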
Example #8
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # style 0 selects the second ranking function, which uses all except the first eigenvalue
    kwargs = {'style': 0}

    # obtain the scores of features
    score = SPEC.spec(X, **kwargs)

    # rank features in descending order of their scores
    idx = SPEC.feature_ranking(score, **kwargs)

    # perform evaluation on the clustering task
    num_fea = 100    # number of selected features
    num_cluster = 20    # number of clusters; usually set to the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total)/20)
    print('ACC:', float(acc_total)/20)
Example #9
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # perform evaluation on the clustering task
    num_fea = 100    # number of selected features
    num_cluster = 20    # number of clusters; usually set to the number of classes in the ground truth

    # obtain the feature weight matrix
    Weight = UDFS.udfs(X, gamma=0.1, n_clusters=num_cluster)

    # rank features in descending order of their scores (most important first)
    idx = feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total)/20)
    print('ACC:', float(acc_total)/20)
Example #10
File: utils.py Project: rsyed0/TSFS
def evaluate_clustering(selected_features, y):
    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = np.zeros(20)
    acc_total = np.zeros(20)
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=len(np.unique(y)), y=y)
        nmi_total[i] = nmi
        acc_total[i] = acc

    # return the mean and standard deviation of NMI and ACC
    return (np.mean(nmi_total), np.std(nmi_total)), (np.mean(acc_total), np.std(acc_total))
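Unlike the other examples, this helper returns means and standard deviations instead of printing averages, which makes run-to-run variance visible. A typical call:

(nmi_mean, nmi_std), (acc_mean, acc_std) = evaluate_clustering(selected_features, y)
print('NMI: {:.3f} +/- {:.3f}'.format(nmi_mean, nmi_std))
print('ACC: {:.3f} +/- {:.3f}'.format(acc_mean, acc_std))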
Example #11
def test_lap_score():
    # load data
    from functools import partial
    mat = scipy.io.loadmat('./data/COIL20.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X, **kwargs_W)
    num_fea = 100  # number of selected features

    pipeline = []

    # partial function required for SelectKBest to work correctly.
    lap_score_partial = partial(lap_score.lap_score, W=W)
    pipeline.append(
        ('select top k', SelectKBest(score_func=lap_score_partial, k=num_fea)))
    model = Pipeline(pipeline)

    # pass a dummy y of zeros to demonstrate that this works in an unsupervised setting.
    selected_features = model.fit_transform(X, y=np.zeros(X.shape[0]))
    print(selected_features.shape)

    # perform evaluation on the clustering task
    num_cluster = 20  # number of clusters; usually set to the number of classes in the ground truth

    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / 20)
    print('ACC:', float(acc_total) / 20)

    assert_true(float(nmi_total) / 20 > 0.5)
    assert_true(float(acc_total) / 20 > 0.5)
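One caveat with this pattern: W is computed once from the full X outside the pipeline, so the scorer stays tied to that exact matrix no matter what data the pipeline is fitted on. If the affinity graph should be rebuilt per fit, a custom transformer is one option; a minimal sketch, assuming the construct_W and lap_score imports used above:

from sklearn.base import BaseEstimator, TransformerMixin

class LapScoreSelector(BaseEstimator, TransformerMixin):
    # select the top num_fea features by Laplacian score, rebuilding the
    # affinity matrix W from the data passed to fit()
    def __init__(self, num_fea=100, kwargs_W=None):
        self.num_fea = num_fea
        self.kwargs_W = kwargs_W or {}

    def fit(self, X, y=None):
        W = construct_W.construct_W(X, **self.kwargs_W)
        score = lap_score.lap_score(X, W=W)
        # lap_score.feature_ranking sorts ascending (smaller score = better)
        self.idx_ = lap_score.feature_ranking(score)[:self.num_fea]
        return self

    def transform(self, X):
        return X[:, self.idx_]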
Example #12
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        "t": 1
    }
    W = construct_W.construct_W(X, **kwargs)

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(X, W=W, n_clusters=20)

    # rank features in descending order of their scores (most important first)
    idx = feature_ranking(Weight)

    # perform evaluation on the clustering task
    num_fea = 100  # number of selected features
    num_cluster = 20  # number of clusters; usually set to the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / 20)
    print('ACC:', float(acc_total) / 20)
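For reference, the affinity matrix that construct_W builds under these kwargs is presumably a kNN graph with heat-kernel edge weights. A rough equivalent with scikit-learn primitives (an assumption; the exact kernel scaling in skfeature may differ):

import numpy as np
from sklearn.neighbors import kneighbors_graph

def heat_kernel_affinity(X, k=5, t=1.0):
    # sparse kNN graph holding pairwise Euclidean distances
    W = kneighbors_graph(X, n_neighbors=k, mode='distance', include_self=False)
    # heat-kernel weights on the kNN edges (one common convention)
    W.data = np.exp(-(W.data ** 2) / (2 * t))
    # symmetrize so W is a valid undirected affinity matrix
    return W.maximum(W.T)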
Example #13
def main():
    # load data
    mat = scipy.io.loadmat("../data/COIL20.mat")
    X = mat["X"]  # data
    X = X.astype(float)
    y = mat["Y"]  # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, "t": 1}
    W = construct_W.construct_W(X, **kwargs_W)

    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)

    # rank features in ascending order of their Laplacian scores (smaller is better)
    idx = lap_score.feature_ranking(score)

    # perform evaluation on the clustering task
    num_fea = 100  # number of selected features
    num_cluster = 20  # number of clusters; usually set to the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print "NMI:", float(nmi_total) / 20
    print "ACC:", float(acc_total) / 20
import scipy.io
#from skfeature.function.statistical_based import low_variance
from skfeature.utility import unsupervised_evaluation

mat = scipy.io.loadmat('../Datasets/BASEHOCK.mat')
X = mat['X']  # data
X = X.astype(float)
y = mat['Y']  # label
y = y[:, 0]

p = 0.1  # specify the threshold p to be 0.1
num_cluster = 2  # specify the number of clusters to be 2

# perform feature selection and obtain the dataset on the selected features
from Statistical_Based.Low_Variance.LowVarianceZeal import Low_Variance_FS
selected_features = Low_Variance_FS(X, p * (1 - p))

# perform k-means clustering on the selected features, repeated 20 times
nmi_total = 0
acc_total = 0
for i in range(0, 20):
    nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features,
                                                  n_clusters=num_cluster,
                                                  y=y)
    nmi_total += nmi
    acc_total += acc

# output the average NMI and average ACC
print('NMI:', float(nmi_total) / 20)
print('ACC:', float(acc_total) / 20)