Example #1
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs = {"metric": "euclidean", "neighborMode": "knn", "weightMode": "heatKernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    num_fea = 100    # specify the number of selected features
    num_cluster = 20    # specify the number of clusters, it is usually set as the number of classes in the ground truth

    # obtain the feature weight matrix
    Weight = MCFS.mcfs(X, n_selected_features=num_fea, W=W, n_clusters=num_cluster)

    # rank features in descending order of their MCFS scores
    idx = MCFS.feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features and repeats 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total)/20)
    print('ACC:', float(acc_total)/20)
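Note: unsupervised_evaluation.evaluation is used above but not shown on this page. A rough sklearn-based sketch of what such an evaluation typically does (k-means on the selected features, then NMI against the ground-truth labels); the function name is hypothetical and the accuracy part is omitted:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score

def evaluation_sketch(X_selected, n_clusters, y):
    # cluster the selected features and compare the partition with the true labels
    pred = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(X_selected)
    nmi = normalized_mutual_info_score(y, pred)
    # clustering "accuracy" additionally needs a label-alignment step
    # (e.g. Hungarian matching), which is left out of this sketch
    return nmi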
Example #2
def mcfs(X, n_selected_features, **kwargs):
    """
    This function implements unsupervised feature selection for multi-cluster data.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            affinity matrix
        n_clusters: {int}
            number of clusters (default is 5)

    Output
    ------
    W: {numpy array}, shape(n_features, n_clusters)
        feature weight matrix

    Reference
    ---------
    Cai, Deng et al. "Unsupervised Feature Selection for Multi-Cluster Data." KDD 2010.
    """

    # use the default affinity matrix
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    # default number of clusters is 5
    if 'n_clusters' not in kwargs:
        n_clusters = 5
    else:
        n_clusters = kwargs['n_clusters']

    # solve the generalized eigen-decomposition problem and get the top K
    # eigen-vectors with respect to the smallest eigenvalues
    W = W.toarray()
    W = (W + W.T) / 2
    W_norm = np.diag(np.sqrt(1 / W.sum(1)))
    W = np.dot(W_norm, np.dot(W, W_norm))
    WT = W.T
    W[W < WT] = WT[W < WT]
    eigen_value, ul = scipy.linalg.eigh(a=W)
    Y = np.dot(W_norm, ul[:, -1 * n_clusters - 1:-1])

    # solve K L1-regularized regression problem using LARs algorithm with cardinality constraint being d
    n_sample, n_feature = X.shape
    W = np.zeros((n_feature, n_clusters))
    for i in range(n_clusters):
        clf = linear_model.Lars(n_nonzero_coefs=n_selected_features)
        clf.fit(X, Y[:, i])
        W[:, i] = clf.coef_
    return W
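Example #1 ranks features with MCFS.feature_ranking, which is not reproduced on this page. A minimal sketch of the usual MCFS ranking rule (take each feature's largest weight magnitude across the n_clusters regression problems and sort descending); the function name below is hypothetical:

import numpy as np

def mcfs_feature_ranking_sketch(Weight):
    # MCFS score of a feature = max of its absolute weights over the cluster-wise regressions
    mcfs_score = np.abs(Weight).max(axis=1)
    # larger score first
    return np.argsort(mcfs_score)[::-1]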
Example #4
def calc_lap_score(data):
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(data, **kwargs_W)

    return lap_score.lap_score(data, W=W)
Example #5
    def utilize_selection_method(self, options):
        logging.info('     Unsupervised Feature Selection : Start')
        self.parse_options(options)
        normalize_feature = SupervisedFs.normalize_feature(self.data_feature)
        feature_amount = len(self.data_feature[0])
        selection_result = {}

        if self.options['v'] == 1:
            widget = ['Calculating Variance             : ', pb.Percentage(), ' ', pb.Bar(marker=pb.RotatingMarker()),
                      ' ', pb.ETA()]
            timer = pb.ProgressBar(widgets=widget, maxval=feature_amount).start()
            variance = []
            for n in range(0, feature_amount):
                variance.append([np.var(normalize_feature[:, n]), n+1])
                timer.update(n)
            timer.finish()
            selection_result['variance'] = sorted(variance, reverse=True)

        if self.options['l'] == 1:
            logging.info('   -----Calculating Laplacian score---- ')
            kwargs_w = {'metric': 'euclidean', 'neighbor_mode': 'knn', 'weight_mode': 'heat_kernel', 'k': 5, 't': 1}
            W = construct_W.construct_W(self.data_feature, **kwargs_w)
            score = lap_score.lap_score(self.data_feature, W=W)
            lap = []
            for n in range(0, feature_amount):
                lap.append([score[n], n+1])
            selection_result['laplacian'] = sorted(lap, reverse=False)
            logging.info('   -----Calculating Laplacian score---- ==> Done')

        if self.options['s'] == 1:
            logging.info('   -----Calculating Spectral score---- ')
            kwargs_w = {'metric': 'euclidean', 'neighbor_mode': 'knn', 'weight_mode': 'heat_kernel', 'k': 5, 't': 1}
            W = construct_W.construct_W(self.data_feature, **kwargs_w)
            kwargs_s = {'style': 2, 'W': W}
            score = SPEC.spec(self.data_feature, **kwargs_s)
            spec = []
            for n in range(0, feature_amount):
                spec.append([score[n], n+1])
            selection_result['spectral'] = sorted(spec, reverse=True)
            logging.info('   -----Calculating Spectral score---- ==> Done')
        return selection_result
Example #6
def lap_score(X, **kwargs):
    """
    This function implements the laplacian score feature selection, steps are as follows:
    1. Construct the affinity matrix W if it is not specified
    2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W
    3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones)
    4. Laplacian score for the r-th feature is score = (fr_hat'*L*fr_hat)/(fr_hat'*D*fr_hat)

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            input affinity matrix

    Output
    ------
    score: {numpy array}, shape (n_features,)
        laplacian score for each feature

    Reference
    ---------
    He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005.
    """

    # if 'W' is not specified, use the default W
    if 'W' not in kwargs.keys():
        W = construct_W(X)

    else:
        # construct the affinity matrix W
        W = kwargs['W']

    # build the diagonal D matrix from affinity matrix W
    D = np.array(W.sum(axis=1))
    # W is used directly in place of L; the Laplacian D - W enters through the final 1 - L_prime/D_prime step
    L = W
    tmp = np.dot(np.transpose(D), X)
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # compute fr_hat' * D * fr_hat for each feature (denominator of Lr)
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp) / D.sum()
    # compute fr_hat' * W * fr_hat for each feature (used for the numerator of Lr)
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp) / D.sum()
    # avoid a zero denominator in Lr
    D_prime[D_prime < 1e-12] = 10000

    # compute laplacian score for all features
    score = 1 - np.array(np.multiply(L_prime, 1 / D_prime))[0, :]
    return np.transpose(score)
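For reference, the vectorized computation above can be cross-checked against a direct, per-feature implementation of steps 2-4 from the docstring. This is a slow, dense sketch for illustration only; the helper name is hypothetical:

import numpy as np

def lap_score_naive(X, W):
    # accept either a sparse or a dense affinity matrix
    W = np.asarray(W.todense()) if hasattr(W, "todense") else np.asarray(W)
    n_samples, n_features = X.shape
    d = W.sum(axis=1)          # diagonal of D
    D = np.diag(d)
    L = D - W                  # graph Laplacian
    score = np.zeros(n_features)
    for r in range(n_features):
        fr = X[:, r]
        # fr_hat = fr - (fr' D 1) / (1' D 1) * 1
        fr_hat = fr - (np.dot(fr, d) / d.sum()) * np.ones(n_samples)
        num = fr_hat @ L @ fr_hat
        den = fr_hat @ D @ fr_hat
        score[r] = num / den if den > 1e-12 else 0.0
    return score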
Example #7
def calc_NDFS(data, n_clusters=20):
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(data, **kwargs)

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(data, W=W, n_clusters=n_clusters)
    return (Weight * Weight).sum(1)
Example #8
def SKF_lap(X, y):
    # construct affinity matrix
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W(X, **kwargs_W)
    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)
    return lap_score.feature_ranking(score)
Example #9
def calc_MCFS(data, n_features, n_clusters=20):
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(data, **kwargs_W)

    return MCFS.mcfs(data,
                     n_selected_features=n_features,
                     W=W,
                     n_clusters=n_clusters).max(1)
Example #10
def test_lap_score():
    # load data
    from functools import partial
    mat = scipy.io.loadmat('./data/COIL20.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X, **kwargs_W)
    num_fea = 100  # number of selected features

    pipeline = []

    # partial function required for SelectKBest to work correctly.
    lap_score_partial = partial(lap_score.lap_score, W=W)
    pipeline.append(
        ('select top k', SelectKBest(score_func=lap_score_partial, k=num_fea)))
    model = Pipeline(pipeline)

    # set y param to be 0 to demonstrate that this works in unsupervised sense.
    selected_features = model.fit_transform(X, y=np.zeros(X.shape[0]))
    print(selected_features.shape)

    # perform evaluation on clustering task
    num_cluster = 20  # number of clusters, it is usually set as the number of classes in the ground truth

    # perform kmeans clustering based on the selected features and repeats 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / 20)
    print('ACC:', float(acc_total) / 20)

    assert_true(float(nmi_total) / 20 > 0.5)
    assert_true(float(acc_total) / 20 > 0.5)
Example #11
    def spec(self, community: int, attributes: list, percentile=0):
        result = []
        percentile = 0.1
        attributes = list(
            filter(lambda x: x != 'nodeId' and x != 'id' and x != 'community',
                   attributes))
        print(len(attributes))
        print('Attributes ', attributes)
        nodes_amount = self.get_nodes_amount_of_community(community)
        community_as_matrix = np.empty((nodes_amount, len(attributes)))
        community_nodes = self.get_community_nodes(community)
        node_index = 0
        for node in community_nodes:
            for attribute_index in range(len(attributes)):
                community_as_matrix[node_index, attribute_index] = node[
                    attributes[attribute_index]]
            node_index += 1

        if nodes_amount >= 5:
            w_matrix = construct_W(community_as_matrix)
        else:
            w_matrix = construct_W(community_as_matrix, k=(nodes_amount - 1))

        # w_matrix = construct_W(community_as_matrix)

        scores = SPEC.spec(community_as_matrix, W=w_matrix)
        ranked_attributes = feature_ranking(scores)
        boundary = len(attributes) * percentile
        # boundary = 1
        print('Percentile ', percentile)
        print('Boundary ', boundary)
        print('Ranked attributes ', ranked_attributes)
        for i in range(len(attributes)):
            if ranked_attributes[i] < boundary:
                result.append(attributes[i])

        return result
Example #12
def lap_score(X, **kwargs):
    """
    This function implements the laplacian score feature selection, steps are as follows:
    1. Construct the affinity matrix W if it is not specified
    2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W
    3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones)
    4. Laplacian score for the r-th feature is score = (fr_hat'*L*fr_hat)/(fr_hat'*D*fr_hat)

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            input affinity matrix

    Output
    ------
    score: {numpy array}, shape (n_features,)
        laplacian score for each feature

    Reference
    ---------
    He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005.
    """

    # if 'W' is not specified, use the default W
    if 'W' not in kwargs.keys():
        W = construct_W(X)
    else:
        # otherwise use the provided affinity matrix W
        W = kwargs['W']
    # build the diagonal D matrix from affinity matrix W
    D = np.array(W.sum(axis=1))
    L = W
    tmp = np.dot(np.transpose(D), X)
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # compute fr_hat' * D * fr_hat for each feature (denominator of Lr)
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # compute fr_hat' * W * fr_hat for each feature (used for the numerator of Lr)
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # avoid a zero denominator in Lr
    D_prime[D_prime < 1e-12] = 10000

    # compute laplacian score for all features
    score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :]
    return np.transpose(score)
Example #13
def fisher_score(X, y):
    """
    This function implements the fisher score feature selection, steps are as follows:
    1. Construct the affinity matrix W in fisher score way
    2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W
    3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones)
    4. Fisher score for the r-th feature is score = (fr_hat'*D*fr_hat)/(fr_hat'*L*fr_hat)-1

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    score: {numpy array}, shape (n_features,)
        fisher score for each feature

    Reference
    ---------
    He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005.
    Duda, Richard et al. "Pattern classification." John Wiley & Sons, 2012.
    """

    # Construct weight matrix W in a fisherScore way
    kwargs = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
    W = construct_W(X, **kwargs)

    # build the diagonal D matrix from affinity matrix W
    D = np.array(W.sum(axis=1))
    L = W
    tmp = np.dot(np.transpose(D), X)
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # compute fr_hat' * D * fr_hat for each feature
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # compute fr_hat' * W * fr_hat for each feature
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # avoid a zero denominator
    D_prime[D_prime < 1e-12] = 10000
    lap_score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :]

    # compute fisher score from laplacian score, where fisher_score = 1/lap_score - 1
    score = 1.0/lap_score - 1
    return np.transpose(score)
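The docstring above references the classic definition from Duda et al.; for comparison, here is a small sketch of the textbook per-feature Fisher score (between-class scatter over within-class scatter), which should agree with the graph-based computation up to numerical detail. The function name is hypothetical:

import numpy as np

def fisher_score_classic(X, y):
    classes = np.unique(y)
    mu = X.mean(axis=0)
    num = np.zeros(X.shape[1])
    den = np.zeros(X.shape[1])
    for c in classes:
        Xc = X[y == c]
        nc = Xc.shape[0]
        num += nc * (Xc.mean(axis=0) - mu) ** 2   # between-class scatter
        den += nc * Xc.var(axis=0)                # within-class scatter
    den[den < 1e-12] = 1e-12                      # avoid division by zero
    return num / den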
Example #15
def SKF_ndfs(X, y):
    # construct affinity matrix
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    W = construct_W(X, **kwargs)
    num_cluster = len(
        set(y)
    )  # specify the number of clusters, it is usually set as the number of classes in the ground truth
    # obtain the feature weight matrix
    Weight = NDFS.ndfs(X, W=W, n_clusters=num_cluster)
    return sparse_learning.feature_ranking(Weight)
Example #16
def mcfs(trnin, num_fea):

    from skfeature.utility import construct_W
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(trnin, **kwargs_W)

    from skfeature.function.sparse_learning_based import MCFS
    score = MCFS.mcfs(trnin, num_fea, W=W)
    idx = MCFS.feature_ranking(score)
    selfea = idx[0:num_fea]
    return selfea
Example #17
def get_lap_score(data, k=5, t=1, top_feature=30):
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": k, 't': t}
    W = construct_W.construct_W(data, **kwargs_W)
    score = lap_score.lap_score(data, W=W)
    #print(score)
    ranking = lap_score.feature_ranking(score)
    #print(idx)
    
    dfscores = pd.DataFrame(score)
    dfcolumns = pd.DataFrame(data.columns)
    #df_rank = pd.DataFrame(idx)
    
    featureScores = pd.concat([dfcolumns,dfscores],axis=1)
    featureScores.columns = ['Feature','Score']  #naming the dataframe columns
    #print(featureScores.nlargest(k,'Score'))  #print 20 best features
    result = featureScores.nlargest(top_feature,'Score')
    
    return result, ranking
Example #18
def laplacian_score(X, y=None, **kwargs):
    # construct affinity matrix
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X, **kwargs_W)

    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)

    # sort the feature scores in an ascending order according to the feature scores
    idx = lap_score.feature_ranking(score)

    return idx
Example #19
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X, **kwargs)

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(X, W=W, n_clusters=20)

    # rank features in descending order of their NDFS scores
    idx = feature_ranking(Weight)

    # perform evaluation on clustering task
    num_fea = 100  # number of selected features
    num_cluster = 20  # number of clusters, it is usually set as the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features and repeats 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / 20)
    print('ACC:', float(acc_total) / 20)
Example #20
def Laplacian_score(diheds):
    import scipy.io
    import numpy
    import os
    #os.chdir('/home/anu/Downloads/scikit-feature-1.0.0')
    from skfeature.function.similarity_based import lap_score
    from skfeature.utility import construct_W
    from numpy import mean
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    idx = []
    # change the path for every system to be run.
    #os.chdir('/home/anu/Downloads/traj_benz_trypsin/')
    for i in range(len(diheds)):
        X = diheds[i]
        W = construct_W.construct_W(X, **kwargs_W)
        score = lap_score.lap_score(X, W=W)
        idx.append(score)
    col_mean = mean(idx, axis=0)
    imp_features = numpy.argsort(col_mean)
    return col_mean, imp_features
Example #21
 def plot_ls_after_vt_filtering(self, threshold):
     data = self.test_reddy_dataset.expression_data.copy()
     vt_data = self.variance_threshold_selector(data, threshold)
     # perform ls filtering
     vt_numpy = vt_data.to_numpy()
     # construct affinity matrix
     kwargs_W = {
         "metric": "cosine",
         "neighbor_mode": "knn",
         "weight_mode": "cosine",
         "k": 40,
         't': 500
     }
     print(
         "We plot the Laplacian scores of the features using the following affinity matrix parameters: "
         + str(kwargs_W))
     W = construct_W.construct_W(vt_numpy, **kwargs_W)
     # compute lap score of each remaining features
     score = lap_score.lap_score(vt_numpy, W=W)
     self.plot_lap_scores(score)
Example #22
def SKF_mcfs(X, y):
    # construct affinity matrix
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    W = construct_W(X, **kwargs)
    num_fea = X.shape[1]  # specify the number of selected features
    num_cluster = len(
        set(y)
    )  # specify the number of clusters, it is usually set as the number of classes in the ground truth
    # obtain the feature weight matrix
    Weight = MCFS.mcfs(X,
                       n_selected_features=num_fea,
                       W=W,
                       n_clusters=num_cluster)
    return MCFS.feature_ranking(Weight)
Example #23
 def lap_score_filtering(self, vt_data, num_features):
     vt_numpy = vt_data.to_numpy()
     # construct affinity matrix
     kwargs_W = {
         "metric": "cosine",
         "neighbor_mode": "knn",
         "weight_mode": "cosine",
         "k": 40,
         't': 500
     }
     print(
         "We perform Laplacian score filtering using the following parameters: "
         + str(kwargs_W))
     W = construct_W.construct_W(vt_numpy, **kwargs_W)
     score = lap_score.lap_score(vt_numpy, W=W)
     idx = lap_score.feature_ranking(score)  # rank features
     filtered_data = vt_data.iloc[:, idx[0:num_features]].copy()
     print("\nThe data now has " + str(len(filtered_data.T)) +
           " features after Laplacian score filtering.")
     return filtered_data
Example #24
def mcfs_score(diheds):
    import scipy.io
    import numpy
    from numpy import mean
    import os
    #os.chdir('/home/anu/Downloads/scikit-feature-1.0.0')
    from skfeature.function.sparse_learning_based import MCFS
    from skfeature.utility import construct_W
    from skfeature.utility import unsupervised_evaluation
    idx = []
    kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    # change the path for every system to be run.
    #os.chdir('/home/anu/Downloads/DESRES-Trajectory_GTT-1-protein/GTT-1-protein')
    for i in range(0, len(diheds), 5):
        X = diheds[i]
        W = construct_W.construct_W(X, **kwargs)
        score = MCFS.mcfs(X, n_selected_features=20, W=W, n_clusters=20)
        idx.append(score)
    col_mean = mean(idx, axis=0)
    imp_features = MCFS.feature_ranking(col_mean)
    return col_mean, imp_features
Example #25
def MCFS_FS(X_train, k):
    # construct affinity matrix
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X_train, **kwargs)
    num_fea_ = k  # specify the number of selected features
    num_cluster = 20  # specify the number of clusters, it is usually set as the number of classes in the ground truth

    # obtain the feature weight matrix
    Weight = MCFS.mcfs(X_train,
                       n_selected_features=num_fea_,
                       W=W,
                       n_clusters=num_cluster)

    # rank features in descending order of their MCFS scores
    idx = MCFS.feature_ranking(Weight)
    return (idx, Weight)
Example #26
def MCFS(X, y=None, **kwargs):
    # construct affinity matrix
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X, **kwargs)

    num_cluster = len(np.unique(y))

    # obtain the feature weight matrix
    Weight = MCFS_CLASS.mcfs(X,
                             n_selected_features=X.shape[1],
                             W=W,
                             n_clusters=num_cluster)

    # rank features in descending order of their MCFS scores
    idx = MCFS_CLASS.feature_ranking(Weight)

    return idx
Example #27
def main():
    # load data
    mat = scipy.io.loadmat("../data/COIL20.mat")
    X = mat["X"]  # data
    X = X.astype(float)
    y = mat["Y"]  # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, "t": 1}
    W = construct_W.construct_W(X, **kwargs_W)

    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)

    # sort the feature scores in an ascending order according to the feature scores
    idx = lap_score.feature_ranking(score)

    # perform evaluation on clustering task
    num_fea = 100  # number of selected features
    num_cluster = 20  # number of clusters, it is usually set as the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features and repeats 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print "NMI:", float(nmi_total) / 20
    print "ACC:", float(acc_total) / 20
Example #28
    def predict(self, X):
        """
        :param X: shape [n_row*n_clm, n_band]
        :return:
        """
        # n_row, n_column, __n_band = X.shape
        # XX = X.reshape((n_row * n_column, -1))  # n_sample * n_band
        XX = X

        kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
        W = construct_W.construct_W(XX, **kwargs_W)

        # obtain the scores of features
        score = lap_score.lap_score(X, W=W)

        # sort the feature scores in an ascending order according to the feature scores
        idx = lap_score.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:self.n_band]]

        # selected_features.reshape((self.n_band, n_row, n_column))
        # selected_features = np.transpose(selected_features, axes=(1, 2, 0))
        return selected_features
Example #29
    def bench(self, X, X_norm, y, n=2):
        num_feats = 20
        output_data = {'method': list(), 'features': list(), 'time': list(), self.test_att: list(), 'supervised': list()}

        # ----------------------------------------------------------------
        # CFS
        # start = time.perf_counter()
        # idx = cfs(X_norm.to_numpy(), y.to_numpy())[0]
        # print(idx)
        # selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        # output_data['method'].append('CFS')
        # output_data['time'].append(time.perf_counter() - start)
        # output_data['features'].append(selected_features)
        # output_data[self.test_att].append(self.train_real_data(selected_features, X))

        # LA: Laplacian Score
        start = time.perf_counter()
        kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
        W = construct_W.construct_W(X_norm.to_numpy(), **kwargs_W)
        score = lap_score.lap_score(X_norm.to_numpy(), W=W)
        idx = lap_score.feature_ranking(score)
        selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        output_data['method'].append('Laplacian Score')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(selected_features)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(selected_features, X))
        print(output_data)

        # FCBF: Feature correlation based filter
        # start = time.perf_counter()
        # idx = fcbf(X_norm.to_numpy(), y.to_numpy(), n_selected_features=num_feats)[0]
        # selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        # output_data['method'].append('FCBF')
        # output_data['time'].append(time.perf_counter() - start)
        # output_data['features'].append(selected_features)
        # output_data['supervised'].append(True)
        # output_data[self.test_att].append(self.train_real_data(selected_features, X))
        # print(output_data)
        # output_data['method'].append('FCBF')
        # output_data['time'].append(9999999)
        # output_data['features'].append([])
        # output_data['supervised'].append(True)
        # output_data[self.test_att].append(0.0)

        # UDFS: Unsupervised Discriminative Feature Selection
        start = time.perf_counter()
        Weight = udfs(X_norm.to_numpy(), gamma=0.1, n_clusters=n)
        idx = feature_ranking(Weight)
        selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        output_data['method'].append('UDFS')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(selected_features)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(selected_features, X))
        print(output_data)

        # SPEC: Spectral Feature Selection
        start = time.perf_counter()
        score = spec(X_norm.to_numpy())
        idx = feature_ranking_spec(score)
        selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        output_data['method'].append('SPEC')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(selected_features)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(selected_features, X))
        print(output_data)

        # MRMR: minimum redundancy maximum relevance
        start = time.perf_counter()
        mrmr = pymrmr.mRMR(X_norm, 'MIQ', num_feats)
        output_data['method'].append('MRMR(MIQ)')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(mrmr)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(mrmr, X))
        print(output_data)

        # MRMR: minimum redundancy maximum relevance
        start = time.perf_counter()
        mrmr = pymrmr.mRMR(X_norm, 'MID', num_feats)
        output_data['method'].append('MRMR(MID)')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(mrmr)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(mrmr, X))
        print(output_data)

        # recursive feature elimination(RFE):

        from sklearn.feature_selection import RFE
        from sklearn.linear_model import LogisticRegression
        rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5)
        start = time.perf_counter()
        rfe_selector.fit(X_norm, y)
        rfe_support = rfe_selector.get_support()
        rfe_feature = X_norm.loc[:, rfe_support].columns.tolist()
        output_data['method'].append('RFE')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(rfe_feature)
        output_data['supervised'].append(True)
        output_data[self.test_att].append(self.train_real_data(rfe_feature, X))
        print(output_data)

        # ----------------------------------------------------------------
        # Lasso: SelectFromModel:

        from sklearn.feature_selection import SelectFromModel
        from sklearn.linear_model import LogisticRegression

        embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), max_features=num_feats)
        start = time.perf_counter()
        embeded_lr_selector.fit(X_norm, y)

        embeded_lr_support = embeded_lr_selector.get_support()
        embeded_lr_feature = X_norm.loc[:, embeded_lr_support].columns.tolist()
        output_data['method'].append('Lasso')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(embeded_lr_feature)
        output_data['supervised'].append(True)
        output_data[self.test_att].append(self.train_real_data(embeded_lr_feature, X))
        print(output_data)
        print(str(len(embeded_lr_feature)), 'selected features')

        # -----------------------------------------------------------------------------
        # Tree - based: SelectFromModel:

        from sklearn.feature_selection import SelectFromModel
        from sklearn.ensemble import RandomForestClassifier

        embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
        start = time.perf_counter()
        embeded_rf_selector.fit(X_norm, y)

        embeded_rf_support = embeded_rf_selector.get_support()
        embeded_rf_feature = X_norm.loc[:, embeded_rf_support].columns.tolist()
        output_data['method'].append('Tree_Based_RF')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(embeded_rf_feature)
        output_data['supervised'].append(True)
        output_data[self.test_att].append(self.train_real_data(embeded_rf_feature, X))
        print(output_data)
        print(str(len(embeded_rf_feature)), 'selected features')

        # -------------------------------------------------------------------------------
        # also tree based:

        from sklearn.feature_selection import SelectFromModel
        from lightgbm import LGBMClassifier

        lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
                              reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

        embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
        start = time.perf_counter()
        embeded_lgb_selector.fit(X_norm, y)

        embeded_lgb_support = embeded_lgb_selector.get_support()
        embeded_lgb_feature = X_norm.loc[:, embeded_lgb_support].columns.tolist()
        output_data['method'].append('Tree_Based_lightGBM')
        output_data['time'].append(time.perf_counter() - start)
        output_data['supervised'].append(True)
        output_data['features'].append(embeded_lgb_feature)
        output_data[self.test_att].append(self.train_real_data(embeded_lgb_feature, X))
        print(output_data)
        print(str(len(embeded_lgb_feature)), 'selected features')

        return output_data
Example #30
def ndfs(X, **kwargs):
    """
    This function implements unsupervised feature selection using nonnegative spectral analysis, i.e.,
    min_{F,W} Tr(F^T L F) + alpha*(||XW-F||_F^2 + beta*||W||_{2,1}) + gamma/2 * ||F^T F - I||_F^2
    s.t. F >= 0
    
    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        W: {sparse matrix}, shape {n_samples, n_samples}
            affinity matrix
        alpha: {float}
            Parameter alpha in objective function
        beta: {float}
            Parameter beta in objective function
        gamma: {float}
            a very large number used to force F^T F = I
        F0: {numpy array}, shape (n_samples, n_clusters)
            initialization of the pseudo label matrix F, if not provided
        n_clusters: {int}
            number of clusters
        verbose: {boolean}
            True if user want to print out the objective function value in each iteration, false if not

    Output
    ------
    W: {numpy array}, shape(n_features, n_clusters)
        feature weight matrix
        
    Reference: 
        Li, Zechao, et al. "Unsupervised Feature Selection Using Nonnegative Spectral Analysis." AAAI. 2012.
    """

    # default gamma is 10e8
    if 'gamma' not in kwargs:
        gamma = 10e8
    else:
        gamma = kwargs['gamma']
    # use the default affinity matrix
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    if 'alpha' not in kwargs:
        alpha = 1
    else:
        alpha = kwargs['alpha']
    if 'beta' not in kwargs:
        beta = 1
    else:
        beta = kwargs['beta']
    if 'F0' not in kwargs:
        if 'n_clusters' not in kwargs:
            print("either F0 or n_clusters should be provided", file=sys.stderr)
        else:
            # initialize F
            n_clusters = kwargs['n_clusters']
            F = kmeans_initialization(X, n_clusters)
    else:
        F = kwargs['F0']
    if 'verbose' not in kwargs:
        verbose = False
    else:
        verbose = kwargs['verbose']

    n_samples, n_features = X.shape

    # initialize D as identity matrix
    D = np.identity(n_features)
    I = np.identity(n_samples)

    # build laplacian matrix
    L = np.array(W.sum(1))[:, 0] - W

    max_iter = 1000
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # update W
        T = np.linalg.inv(
            np.dot(X.transpose(), X) + beta * D + 1e-6 * np.eye(n_features))
        W = np.dot(np.dot(T, X.transpose()), F)
        # update D
        temp = np.sqrt((W * W).sum(1))
        temp[temp < 1e-16] = 1e-16
        temp = 0.5 / temp
        D = np.diag(temp)
        # update M
        M = L + alpha * (I - np.dot(np.dot(X, T), X.transpose()))
        M = (M + M.transpose()) / 2
        # update F
        denominator = np.dot(M,
                             F) + gamma * np.dot(np.dot(F, F.transpose()), F)
        temp = np.divide(gamma * F, denominator)
        F = F * np.array(temp)
        temp = np.diag(np.sqrt(np.diag(1 /
                                       (np.dot(F.transpose(), F) + 1e-16))))
        F = np.dot(F, temp)

        # calculate objective function
        obj[iter_step] = np.trace(np.dot(np.dot(
            F.transpose(), M), F)) + gamma / 4 * np.linalg.norm(
                np.dot(F.transpose(), F) - np.identity(n_clusters), 'fro')
        if verbose:
            print('obj at iter ' + str(iter_step + 1) + ': ' + str(obj[iter_step]))

        if iter_step >= 1 and math.fabs(obj[iter_step] -
                                        obj[iter_step - 1]) < 1e-3:
            break
    return W
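The loop above starts from F returned by kmeans_initialization, which is not shown in this snippet. A plausible sketch, following the paper's description of a weighted cluster indicator matrix (this helper is an assumption, not the library's exact code):

import numpy as np
from sklearn.cluster import KMeans

def kmeans_initialization_sketch(X, n_clusters):
    # run k-means and build a binary cluster indicator matrix Y (n_samples x n_clusters)
    labels = KMeans(n_clusters=n_clusters, n_init=10).fit(X).labels_
    n_samples = X.shape[0]
    Y = np.zeros((n_samples, n_clusters))
    Y[np.arange(n_samples), labels] = 1
    # scale columns so that F^T F is close to the identity (weighted indicator),
    # and add a small positive offset to keep F strictly positive for the
    # multiplicative updates above
    F = Y @ np.diag(1.0 / np.sqrt(Y.sum(axis=0) + 1e-12))
    return F + 0.02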
Example #31
def trace_ratio(X, y, n_selected_features, **kwargs):
    """
    This function implements the trace ratio criterion for feature selection

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        style: {string}
            style == 'fisher', build between-class matrix and within-class affinity matrix in a fisher score way
            style == 'laplacian', build between-class matrix and within-class affinity matrix in a laplacian score way
        verbose: {boolean}
            True if user want to print out the objective function value in each iteration, False if not

    Output
    ------
    feature_idx: {numpy array}, shape (n_features,)
        the ranked (descending order) feature index based on subset-level score
    feature_score: {numpy array}, shape (n_features,)
        the feature-level score
    subset_score: {float}
        the subset-level score

    Reference
    ---------
    Feiping Nie et al. "Trace Ratio Criterion for Feature Selection." AAAI 2008.
    """

    # if 'style' is not specified, use the fisher score way to build the two affinity matrices
    if 'style' not in kwargs.keys():
        kwargs['style'] = 'fisher'
    # get the way to build affinity matrix, 'fisher' or 'laplacian'
    style = kwargs['style']
    n_samples, n_features = X.shape

    # if 'verbose' is not specified, do not output the value of objective function
    if 'verbose' not in kwargs:
        kwargs['verbose'] = False
    verbose = kwargs['verbose']

    if style == 'fisher':
        kwargs_within = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
        # build within class and between class laplacian matrix L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        L_within = np.eye(n_samples) - W_within
        L_tmp = np.eye(n_samples) - np.ones([n_samples, n_samples])/n_samples
        L_between = L_within - L_tmp

    if style == 'laplacian':
        kwargs_within = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
        # build within class and between class laplacian matrix L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        D_within = np.diag(np.array(W_within.sum(1))[:, 0])
        L_within = D_within - W_within
        W_between = np.dot(np.dot(D_within, np.ones([n_samples, n_samples])), D_within)/np.sum(D_within)
        D_between = np.diag(np.array(W_between.sum(1)))
        L_between = D_between - W_between

    # build X'*L_within*X and X'*L_between*X
    L_within = (np.transpose(L_within) + L_within)/2
    L_between = (np.transpose(L_between) + L_between)/2
    S_within = np.array(np.dot(np.dot(np.transpose(X), L_within), X))
    S_between = np.array(np.dot(np.dot(np.transpose(X), L_between), X))

    # reflect the within-class or local affinity relationship encoded on graph, Sw = X*Lw*X'
    S_within = (np.transpose(S_within) + S_within)/2
    # reflect the between-class or global affinity relationship encoded on graph, Sb = X*Lb*X'
    S_between = (np.transpose(S_between) + S_between)/2

    # take the absolute values of diagonal
    s_within = np.absolute(S_within.diagonal())
    s_between = np.absolute(S_between.diagonal())
    s_between[s_between == 0] = 1e-14  # this number is from the authors' code

    # preprocessing
    fs_idx = np.argsort(np.divide(s_between, s_within), 0)[::-1]
    k = np.sum(s_between[0:n_selected_features])/np.sum(s_within[0:n_selected_features])
    s_within = s_within[fs_idx[0:n_selected_features]]
    s_between = s_between[fs_idx[0:n_selected_features]]

    # iterate until convergence
    count = 0
    while True:
        score = np.sort(s_between-k*s_within)[::-1]
        I = np.argsort(s_between-k*s_within)[::-1]
        idx = I[0:n_selected_features]
        old_k = k
        k = np.sum(s_between[idx])/np.sum(s_within[idx])
        if verbose:
            print('obj at iter {0}: {1}'.format(count+1, k))
        count += 1
        if abs(k - old_k) < 1e-3:
            break

    # get feature index, feature-level score and subset-level score
    feature_idx = fs_idx[I]
    feature_score = score
    subset_score = k

    return feature_idx, feature_score, subset_score
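A minimal usage sketch for the trace_ratio function above, on random toy data (illustrative only; assumes numpy is available as np):

import numpy as np

X = np.random.rand(100, 10)          # 100 samples, 10 features
y = np.random.randint(0, 2, 100)     # 2 classes

# rank features with the fisher-style graphs and keep the top 5
idx, feature_score, subset_score = trace_ratio(X, y, n_selected_features=5, style='fisher')
X_selected = X[:, idx[0:5]]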
Example #32
    # build the sparse affinity matrix W
    W = csc_matrix((G[:, 2], (G[:, 0], G[:, 1])), shape=(n_samples, n_samples))
    bigger = np.transpose(W) > W
    W = W - W.multiply(bigger) + np.transpose(W).multiply(bigger)
    print('Sparse Affinity Matrix:', W)

    ## Logging
    #    with open('output.txt', 'a') as f:
    #        print("W", file=f)
    #        print(W, file=f)

    ##Euclidean laplacian result
    numTrainData = trainData.values
    kwargs_W = {"metric": "euclidean", "neighbour_mode": "knn"}
    W = construct_W.construct_W(numTrainData, **kwargs_W)

    ## Calculate Laplacian Score
    score = lap_score.lap_score(numTrainData, W=W)
    print('Laplacian Score:', score)

    ## Logging
    with open('output.txt', 'a') as f:
        print("Laplacian Score", file=f)
        print(score, file=f)

    # Laplacian HEOM result hardcoded
    """score = np.array(
        [np.nan, np.nan, np.nan, np.nan, 0.25866548, 0.25866548, np.nan, 0.25946108, np.nan, np.nan, np.nan, np.nan,
         0.67265115, 0.73108302, np.nan, np.nan, np.nan, 0.86144223, np.nan, 0.6201575, np.nan, np.nan, np.nan, np.nan,
         np.nan, np.nan, np.nan, np.nan, np.nan, 0.8655987, 0.85803891, 0.87968564, 0.88995775, 0.87647355, 0.86576088,
Example #33
def ndfs(X, **kwargs):
    """
    This function implements unsupervised feature selection using nonnegative spectral analysis, i.e.,
    min_{F,W} Tr(F^T L F) + alpha*(||XW-F||_F^2 + beta*||W||_{2,1}) + gamma/2 * ||F^T F - I||_F^2
    s.t. F >= 0
    
    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        W: {sparse matrix}, shape {n_samples, n_samples}
            affinity matrix
        alpha: {float}
            Parameter alpha in objective function
        beta: {float}
            Parameter beta in objective function
        gamma: {float}
            a very large number used to force F^T F = I
        F0: {numpy array}, shape (n_samples, n_clusters)
            initialization of the pseudo label matrix F, if not provided
        n_clusters: {int}
            number of clusters
        verbose: {boolean}
            True if user want to print out the objective function value in each iteration, false if not

    Output
    ------
    W: {numpy array}, shape(n_features, n_clusters)
        feature weight matrix
        
    Reference: 
        Li, Zechao, et al. "Unsupervised Feature Selection Using Nonnegative Spectral Analysis." AAAI. 2012.
    """

    # default gamma is 10e8
    if 'gamma' not in kwargs:
        gamma = 10e8
    else:
        gamma = kwargs['gamma']
    # use the default affinity matrix
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    if 'alpha' not in kwargs:
        alpha = 1
    else:
        alpha = kwargs['alpha']
    if 'beta' not in kwargs:
        beta = 1
    else:
        beta = kwargs['beta']
    if 'F0' not in kwargs:
        if 'n_clusters' not in kwargs:
            print("either F0 or n_clusters should be provided", file=sys.stderr)
        else:
            # initialize F
            n_clusters = kwargs['n_clusters']
            F = kmeans_initialization(X, n_clusters)
    else:
        F = kwargs['F0']
    if 'verbose' not in kwargs:
        verbose = False
    else:
        verbose = kwargs['verbose']
    
    n_samples, n_features = X.shape

    # initialize D as identity matrix
    D = np.identity(n_features)
    I = np.identity(n_samples)

    # build laplacian matrix
    L = np.array(W.sum(1))[:, 0] - W

    max_iter = 1000
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # update W
        T = np.linalg.inv(np.dot(X.transpose(), X) + beta * D + 1e-6*np.eye(n_features))
        W = np.dot(np.dot(T, X.transpose()), F)
        # update D
        temp = np.sqrt((W*W).sum(1))
        temp[temp < 1e-16] = 1e-16
        temp = 0.5 / temp
        D = np.diag(temp)
        # update M
        M = L + alpha * (I - np.dot(np.dot(X, T), X.transpose()))
        M = (M + M.transpose())/2
        # update F
        denominator = np.dot(M, F) + gamma*np.dot(np.dot(F, F.transpose()), F)
        temp = np.divide(gamma*F, denominator)
        F = F*np.array(temp)
        temp = np.diag(np.sqrt(np.diag(1 / (np.dot(F.transpose(), F) + 1e-16))))
        F = np.dot(F, temp)

        # calculate objective function
        obj[iter_step] = np.trace(np.dot(np.dot(F.transpose(), M), F)) + gamma/4*np.linalg.norm(np.dot(F.transpose(), F)-np.identity(n_clusters), 'fro')
        if verbose:
            print('obj at iter ' + str(iter_step+1) + ': ' + str(obj[iter_step]))

        if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3:
            break
    return W
Example #34
def select(dataset, features_number, clusters_number):

    app_logger.info(
        'STARTED [MCFS Selection] on {0} with features number = {1}'.format(
            dataset, features_number),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving all features extracted by tsfresh from the pickles on the disk
    current_dir = os.getcwd().split('\\')[-1]
    projet_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    if current_dir == projet_dir:
        all_features_train = pd.read_pickle(
            'Pickle/AllFeatures/Train/{0}.pkl'.format(dataset))
        all_features_test = pd.read_pickle(
            'Pickle/AllFeatures/Test/{0}.pkl'.format(dataset))
    else:
        all_features_train = pd.read_pickle(
            '../Pickle/AllFeatures/Train/{0}.pkl'.format(dataset))
        all_features_test = pd.read_pickle(
            '../Pickle/AllFeatures/Test/{0}.pkl'.format(dataset))

    app_logger.info(
        'All features (including target column) trainset shape: {0}'.format(
            all_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'All features (including target column) testset shape: {0}'.format(
            all_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    # np.savetxt(r'testDataFrame.txt', all_features_test.values, fmt='%d')

    # Retrieving the independent columns of both sets and the known labels of the test set
    indipendent_columns_train = all_features_train.iloc[:, 1:]
    indipendent_columns_test = all_features_test.iloc[:, 1:]
    known_labels_test = all_features_test.iloc[:, 0]

    # Building matrix W for MCFS algorithm
    kwargs = {
        'metric': 'euclidean',
        'neighbor_mode': 'knn',
        'weight_mode': 'binary',
        'k': 3
        # 'weight_mode': 'heat_kernel',
        # 'k': 5,
        # 't': 1
    }
    W = construct_W.construct_W(indipendent_columns_train.values, **kwargs)

    # MCFS gives a weight to each features
    kwargs = {'W': W, 'n_clusters': clusters_number}
    weighted_features = MCFS.mcfs(indipendent_columns_train.values,
                                  features_number, **kwargs)

    # Ordering the features according to their weight
    ordered_features = MCFS.feature_ranking(weighted_features)

    # Getting only the first 'features_number' features
    selected_features = ordered_features[0:features_number]

    # Getting names of selected features
    names_selected_features = []
    for feature_index in selected_features:
        names_selected_features.append(
            indipendent_columns_train.columns[feature_index])

    # Keep only the selected features on the train set
    selected_features_train = indipendent_columns_train.loc[:,
                                                            names_selected_features]
    app_logger.info('Selected features trainset: {0}'.format(
        selected_features_train.shape),
                    extra=LOGGER_EXTRA_OBJECT)

    # Keep only the selected features on the test set
    selected_features_test = indipendent_columns_test.loc[:,
                                                          names_selected_features]
    app_logger.info('Selected features testset: {0}'.format(
        selected_features_test.shape),
                    extra=LOGGER_EXTRA_OBJECT)
    '''
    # Pickles for rfd
    if selected_features_train.shape[0] > 1000:
        print('Test-set')
        selected_features_test.to_pickle('../rfd/Pickle_rfd/MCFS/{0}.pkl'.format(dataset))
    else:
        print('Train-set')
        selected_features_train.to_pickle('../rfd/Pickle_rfd/MCFS/{0}.pkl'.format(dataset))
    exit()
    '''

    # Running k-means according to selected features
    test_feature_selection.testFeatureSelectionWithRepeatedKMeans(
        'MCFS', features_number, dataset, selected_features_train.values,
        selected_features_test.values, clusters_number, known_labels_test)

    app_logger.info('ENDED [MCFS Selection] on {0}'.format(dataset),
                    extra=LOGGER_EXTRA_OBJECT)


# Testing
#select('TwoPatterns', 10, 4)
Example #35
def trace_ratio(X, y, n_selected_features=None, mode='rank', **kwargs):
    """
    This function implements the trace ratio criterion for feature selection

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        style: {string}
            style == 'fisher', build between-class matrix and within-class affinity matrix in a fisher score way
            style == 'laplacian', build between-class matrix and within-class affinity matrix in a laplacian score way
        verbose: {boolean}
            True if user want to print out the objective function value in each iteration, False if not

    Output
    ------
    feature_idx: {numpy array}, shape (n_features,)
        the ranked (descending order) feature index based on subset-level score
    feature_score: {numpy array}, shape (n_features,)
        the feature-level score
    subset_score: {float}
        the subset-level score

    Reference
    ---------
    Feiping Nie et al. "Trace Ratio Criterion for Feature Selection." AAAI 2008.
    """
    if n_selected_features is None:
        n_selected_features = X.shape[1]
    # if 'style' is not specified, use the fisher score way to build the two affinity matrices
    if 'style' not in list(kwargs.keys()):
        kwargs['style'] = 'fisher'
    # get the way to build affinity matrix, 'fisher' or 'laplacian'
    style = kwargs['style']
    n_samples, n_features = X.shape

    # if 'verbose' is not specified, do not output the value of objective function
    if 'verbose' not in kwargs:
        kwargs['verbose'] = False
    verbose = kwargs['verbose']

    if style == 'fisher':
        kwargs_within = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
        # build within class and between class laplacian matrix L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        L_within = np.eye(n_samples) - W_within
        L_tmp = np.eye(n_samples) - np.ones([n_samples, n_samples])/n_samples
        L_between = L_within - L_tmp

    if style == 'laplacian':
        kwargs_within = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
        # build within class and between class laplacian matrix L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        D_within = np.diag(np.array(W_within.sum(1))[:, 0])
        L_within = D_within - W_within
        W_between = np.dot(np.dot(D_within, np.ones([n_samples, n_samples])), D_within)/np.sum(D_within)
        D_between = np.diag(np.array(W_between.sum(1)))
        L_between = D_between - W_between

    # build X'*L_within*X and X'*L_between*X
    L_within = (np.transpose(L_within) + L_within)/2
    L_between = (np.transpose(L_between) + L_between)/2
    S_within = np.array(np.dot(np.dot(np.transpose(X), L_within), X))
    S_between = np.array(np.dot(np.dot(np.transpose(X), L_between), X))

    # reflect the within-class or local affinity relationship encoded on graph, Sw = X*Lw*X'
    S_within = (np.transpose(S_within) + S_within)/2
    # reflect the between-class or global affinity relationship encoded on graph, Sb = X*Lb*X'
    S_between = (np.transpose(S_between) + S_between)/2

    # take the absolute values of diagonal
    s_within = np.absolute(S_within.diagonal())
    s_between = np.absolute(S_between.diagonal())
    s_between[s_between == 0] = 1e-14  # this number is from the authors' code

    # preprocessing
    fs_idx = np.argsort(np.divide(s_between, s_within), 0)[::-1]
    k = np.sum(s_between[0:n_selected_features])/np.sum(s_within[0:n_selected_features])
    s_within = s_within[fs_idx[0:n_selected_features]]
    s_between = s_between[fs_idx[0:n_selected_features]]

    # iterate until convergence
    count = 0
    while True:
        score = np.sort(s_between-k*s_within)[::-1]
        I = np.argsort(s_between-k*s_within)[::-1]
        idx = I[0:n_selected_features]
        old_k = k
        k = np.sum(s_between[idx])/np.sum(s_within[idx])
        if verbose:
            print('obj at iter {0}: {1}'.format(count+1, k))
        count += 1
        if abs(k - old_k) < 1e-3:
            break

    # get feature index, feature-level score and subset-level score
    feature_idx = fs_idx[I]
    feature_score = score
    subset_score = k
    
    if mode == 'raw':
        return feature_idx, feature_score, subset_score
    elif mode == 'index':
        return feature_idx
    else:
        return reverse_argsort(feature_idx)
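The mode='rank' branch relies on reverse_argsort, which is not shown on this page. A plausible sketch of such an inverse-permutation helper (name and exact behavior are assumptions):

import numpy as np

def reverse_argsort_sketch(idx):
    # turn "idx[i] = index of the feature ranked i-th" into
    # "ranks[j] = rank of feature j", aligned with the original feature order
    ranks = np.empty_like(idx)
    ranks[idx] = np.arange(len(idx))
    return ranks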
Example #36
        print(count)
        X_train, X_test = features[train_index], features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        start_time = str(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        acc = []

        # lap_score
        method = 'lap_score'
        kwargs_W = {
            "metric": "euclidean",
            "neighbor_mode": "knn",
            "weight_mode": "heat_kernel",
            "k": 5,
            't': 1
        }
        W = construct_W.construct_W(X_train, **kwargs_W)
        score = lap_score.lap_score(X_train, W=W)
        idx = lap_score.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # fisher_score
        score = fisher_score.fisher_score(X_train, y_train)
        idx = fisher_score.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))
Example #37
import urllib.request

import numpy as np
import matplotlib.pyplot as plt

from skfeature.function.similarity_based import lap_score
from skfeature.utility import construct_W


# URL for the Pima Indians Diabetes dataset (UCI Machine Learning Repository)
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# download the file
raw_data = urllib.request.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
X = dataset[:, :8]    # the first 8 columns are the features
y = dataset[:, 8]     # the last column is the class label


kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
W = construct_W.construct_W(X, **kwargs_W)
score = lap_score.lap_score(X, W=W)
print(score)
idx = lap_score.feature_ranking(score)

fig = plt.figure()
plt.plot(score, label='Laplacian Score')

plt.legend(loc='upper center', shadow=True)
plt.show()
print(idx)
num_fea = 3

#selected_features = X[:, idx[0:num_fea]]
#print(selected_features)