Example #1
def fa_dim_red(x_train_scaled, dataset_name, features_num = 2):
    z=0
    losses = []
    for k in range(1, x_train_scaled.shape[1]+1):
        fa = FeatureAgglomeration(n_clusters=k)
        fa_result = fa.fit_transform(x_train_scaled)
        x_projected_fa = fa.inverse_transform(fa_result)
        loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
        losses.append(loss)
            
    np_feature_losses_percent = np.multiply(100, losses/np.sum(losses))
    print('num of clusters < 10% loss')
    for i in range(len(np_feature_losses_percent)):
        z=z+np_feature_losses_percent[i]
        if z>90:
            print(i+1)
            break
    print(np_feature_losses_percent)
    plt.bar(list(range(1,len(np_feature_losses_percent)+1)),np_feature_losses_percent)
    plt.title("FeatureAgglomeration Projection Losses % ("+str(dataset_name)+")")
    plt.ylabel("Mean Squared Error (% of Total)")
    plt.xlabel("Features")
    plt.savefig((str(dataset_name))+' fa analysis.png')
    plt.show()

    fa = FeatureAgglomeration(n_clusters=features_num)
    fa_result = fa.fit_transform(x_train_scaled)
    print(fa_result.shape)
    x_projected_fa = fa.inverse_transform(fa_result)
    print(x_projected_fa.shape)
    print(x_train_scaled.shape)
    loss = ((x_train_scaled - x_projected_fa) ** 2).mean()
    print('loss')
    print(loss)
    return fa_result,x_projected_fa
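A minimal, hypothetical call of fa_dim_red above, for illustration only: the random standardized matrix and the dataset name 'demo' are placeholders, and the imports mirror the ones the function itself relies on (numpy as np, matplotlib.pyplot as plt, FeatureAgglomeration).
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import FeatureAgglomeration

X_scaled = np.random.randn(100, 8)    # placeholder for scaled training data
fa_result, X_back = fa_dim_red(X_scaled, 'demo', features_num=2)
print(fa_result.shape, X_back.shape)  # expected: (100, 2) (100, 8)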
Example #2
    def cluster(self, cluster_images, cluster_db_path, diffnet_path='model_checkpoints/', save_csv=False):
        compressions = []

        # Finding features
        diffnet = DiffNet(self.db, db_path=self.db_path)
        diffnet.restore(diffnet_path)
        print("Calculating features for", len(cluster_images), "images")
        for img in cluster_images:
            print("Finding features for:", img)
            one_hot = diffnet.feedforward(img, cluster_db_path)
            output = self.sess.run(self.compressed, feed_dict={self.Q: [one_hot], self.keep_prob: 1.})
            compressions.append(output[0])

        # Clustering
        print("Performing clustering...")
        compressions = np.array(compressions)
        fa = FeatureAgglomeration(n_clusters=30)
        X_clusters = fa.fit_transform(compressions)

        print("Collecting data...")
        csv_dict_arr = []
        for i, img in enumerate(cluster_images):
            csv_dict_arr.append({'1.img': img, '2.class': np.argmax(X_clusters[i]), '3.features': compressions[i]})

        # Saving
        if save_csv:
            print("Saving data to csv...")
            keys = load_label_list(csv_dict_arr[0])
            with open('cluster_result.csv', 'w') as output_file:
                dict_writer = csv.DictWriter(output_file, keys, delimiter=';')
                dict_writer.writeheader()
                dict_writer.writerows(csv_dict_arr)

        return csv_dict_arr
Example #3
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = Pipeline([
            ("RF", RandomForestRegressor(n_estimators=200, max_depth=15,
                                         n_jobs=N_JOBS))])
        self.scaler = StandardScaler()
        self.agglo = FeatureAgglomeration(n_clusters=500)

    def fit(self, X, y):
        y = y.ravel()
        n_samples, n_lags, n_lats, n_lons = X.shape
        self.scaler.fit(X[:, -1].reshape(n_samples, -1))
        X = X.reshape(n_lags * n_samples, -1)
        connectivity = grid_to_graph(n_lats, n_lons)
        self.agglo.connectivity = connectivity
        X = self.scaler.transform(X)
        X = self.agglo.fit_transform(X)
        X = X.reshape(n_samples, -1)
        self.clf.fit(X, y)

    def predict(self, X):
        n_samples, n_lags, n_lats, n_lons = X.shape
        X = X.reshape(n_lags * n_samples, -1)
        X = self.scaler.transform(X)
        X = self.agglo.transform(X)
        X = X.reshape(n_samples, -1)
        return self.clf.predict(X)
Example #4
class AgglomerativeFeatureTransformer:
    def __init__(self, n_clusters=2):
        self.model = FeatureAgglomeration(n_clusters=n_clusters)

    def __call__(self, data):
        if data is None or data.shape[0] == 0:
            return None
        return self.model.fit_transform(data)
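A small usage sketch for the wrapper above, illustrative only: the random matrix is placeholder data, and FeatureAgglomeration is assumed to be imported as in the original module.
import numpy as np
from sklearn.cluster import FeatureAgglomeration  # required by the class above

X = np.random.rand(100, 10)                  # 100 samples, 10 features
reducer = AgglomerativeFeatureTransformer(n_clusters=3)
X_reduced = reducer(X)                       # shape (100, 3); each column pools one cluster of features
print(X_reduced.shape)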
Example #5
    def do_feature_agglomoration(self, data):
        print("Using feature agglomeration to reduce the matrix's dimensionality...")
        if self.k:
            n = self.k
        else:
            n = 20

        agglo = FeatureAgglomeration(n_clusters=n, affinity="cosine", linkage="complete")
        return agglo.fit_transform(data)
Example #6
    def token_cluster(self, n_clusters=300):

        from scipy import sparse
        from sklearn.cluster import FeatureAgglomeration

        FA = FeatureAgglomeration(n_clusters=n_clusters)

        self.bow_corpus = FA.fit_transform(self.bow_corpus)
        self.bow_corpus = sparse.csr_matrix(self.bow_corpus)
Example #7
def apply_feature_agglomeration(table, features, label, n_components):
    from sklearn.cluster import FeatureAgglomeration
    from paje import feature_file_processor

    x, y = feature_file_processor.split_features_target(table, features, label)

    fa = FeatureAgglomeration(n_clusters=n_components, linkage='ward')
    pc = fa.fit_transform(x)

    return feature_file_processor.generate_data_frame(pc, table[[label]])
Example #8
def dim_reduction_FA(data, distance_threshold=0.45):
    """
    Params:
        data: ndarray of shape (n_samples, n_features)
        distance_threshold: Optimal threshold value for similarity measure
    Returns: (reducedDimData, nReducedComponents)
    """
    agglo = FeatureAgglomeration(n_clusters=None,distance_threshold=distance_threshold)
    reducedDimData = agglo.fit_transform(data)
    return reducedDimData, agglo.n_clusters_
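A quick, hypothetical call of dim_reduction_FA: it assumes scikit-learn >= 0.21 (for distance_threshold) and the FeatureAgglomeration import used above; the random standardized matrix and the threshold are illustrative, not tuned values.
import numpy as np
from sklearn.cluster import FeatureAgglomeration  # used inside dim_reduction_FA
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(np.random.rand(50, 12))   # placeholder data
X_reduced, n_clusters_found = dim_reduction_FA(X, distance_threshold=0.45)
print(X_reduced.shape, n_clusters_found)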
Example #9
def getDR(dt_all, n_comp=12):
    # cols
    cols_encode_label = dt_all.filter(
        regex="Encode_Label").columns.values.tolist()
    cols_cat = dt_all.drop(
        "ID", axis=1).select_dtypes(include=["object"]).columns.tolist()

    # standardize
    dt_all_norm = MinMaxScaler().fit_transform(
        dt_all.drop(["y", "Fold"] + cols_cat + cols_encode_label, axis=1))

    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
    tsvd_results = tsvd.fit_transform(dt_all_norm)

    # PCA
    pca = PCA(n_components=n_comp, random_state=420)
    pca_results = pca.fit_transform(dt_all_norm)

    # ICA
    ica = FastICA(n_components=n_comp, max_iter=5000, random_state=420)
    ica_results = ica.fit_transform(dt_all_norm)

    # GRP
    grp = GaussianRandomProjection(n_components=n_comp,
                                   eps=0.1,
                                   random_state=420)
    grp_results = grp.fit_transform(dt_all_norm)

    # SRP
    srp = SparseRandomProjection(n_components=n_comp,
                                 dense_output=True,
                                 random_state=420)
    srp_results = srp.fit_transform(dt_all_norm)

    # NMF
    nmf = NMF(n_components=n_comp, init='nndsvdar', random_state=420)
    nmf_results = nmf.fit_transform(dt_all_norm)

    # FAG (FeatureAgglomeration)
    fag = FeatureAgglomeration(n_clusters=n_comp, linkage='ward')
    fag_results = fag.fit_transform(dt_all_norm)

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        dt_all['DR_TSVD_' + str(i)] = tsvd_results[:, i - 1]
        dt_all['DR_PCA_' + str(i)] = pca_results[:, i - 1]
        dt_all['DR_ICA_' + str(i)] = ica_results[:, i - 1]
        dt_all['DR_GRP_' + str(i)] = grp_results[:, i - 1]
        dt_all['DR_SRP_' + str(i)] = srp_results[:, i - 1]
        dt_all['DR_NMF_' + str(i)] = nmf_results[:, i - 1]
        dt_all['DR_FAG_' + str(i)] = fag_results[:, i - 1]

    return (dt_all)
Example #10
    def feat_agglom(self, n_clusters, standardize=True):
        """ Method for running feature agglomeration

        Args:
        n_clusters (Integer):  Number of feature clusters to create
        standardize (Boolean):  Whether to standardize the features before clustering

        Attributes:
        df:  Pandas dataframe containing the target and feature variables
        target_var:  The target variable

        Returns:
        Pandas dataframe containing the feature name and the cluster number

        """

        df = self.df

        cat_features = df.loc[:, df.dtypes == object]

        if not cat_features.empty:
            df = self.prep_cat_vars(df)

        X = df.drop([self.target_var], axis=1)
        X_df = X
        X = X_df.values

        if standardize == True:
            scaler = StandardScaler()
            X = scaler.fit_transform(X)

        agglo = FeatureAgglomeration(n_clusters=n_clusters)
        clusters = agglo.fit_transform(X)

        cluster_numbers = pd.DataFrame(agglo.labels_)
        feat_labels = pd.DataFrame(X_df.columns)
        var_clust = feat_labels.merge(cluster_numbers,
                                      left_index=True,
                                      right_index=True)
        var_clust.columns = ['Feature_Label', 'Cluster_Number']
        var_clust.sort_values('Cluster_Number', inplace=True)

        return var_clust
Example #11
    def feat_agglom(self, n_clusters, standardize=True):
        """Function for running feature agglomeration

        Parameters
        ----------
        n_clusters : int
            Number of clusters to create from the variables
        standardize : bool
            Boolean identifying if features should be standardized before clustering

        Returns
        -------
        var_clust : pandas.DataFrame
            Pandas DataFrame containing the feature name and the cluster number
        """

        df = self.df

        cat_features = df.loc[:, df.dtypes == object]

        if not cat_features.empty:
            df = self.prep_cat_vars(df)

        X = df.drop([self.target_var], axis=1)
        X_df = X
        X = X_df.values

        if standardize == True:
            scaler = StandardScaler()
            X = scaler.fit_transform(X)

        agglo = FeatureAgglomeration(n_clusters=n_clusters)
        clusters = agglo.fit_transform(X)

        cluster_numbers = pd.DataFrame(agglo.labels_)
        feat_labels = pd.DataFrame(X_df.columns)
        var_clust = feat_labels.merge(
            cluster_numbers, left_index=True, right_index=True
        )
        var_clust.columns = ["features", "Cluster_Number"]
        var_clust.sort_values("Cluster_Number", inplace=True)

        return var_clust
Example #12
                                        1.4], [7.7, 3., 6.1, 2.3],
                 [6.3, 3.4, 5.6, 2.4], [6.4, 3.1, 5.5,
                                        1.8], [6., 3., 4.8, 1.8],
                 [6.9, 3.1, 5.4, 2.1], [6.7, 3.1, 5.6, 2.4],
                 [6.9, 3.1, 5.1, 2.3], [5.8, 2.7, 5.1, 1.9],
                 [6.8, 3.2, 5.9, 2.3], [6.7, 3.3, 5.7,
                                        2.5], [6.7, 3., 5.2, 2.3],
                 [6.3, 2.5, 5., 1.9], [6.5, 3., 5.2, 2.], [6.2, 3.4, 5.4, 2.3],
                 [5.9, 3., 5.1, 1.8]])
print('Original shape: {}\n'.format(data.shape))
print('First 10:\n{}\n'.format(repr(data[:10])))

from sklearn.cluster import FeatureAgglomeration

agg = FeatureAgglomeration(n_clusters=2)
new_data = agg.fit_transform(data)
print('New shape: {}\n'.format(new_data.shape))
print('First 10:\n{}\n'.format(repr(new_data[:10])))
'''
Original shape: (150, 4)

First 10:
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       ...
'''
Example #13
def elbowHeuristic_FA(data,
                      markX=None,
                      markY=None,
                      annotX=None,
                      annotY=None,
                      figPath=None):
    """
    Given the data on which feature agglomeration is to be applied, this function
    plots the curve for the elbow heuristic. Pass the annotation parameters after
    manual inspection to produce the final graph.
    Params:
        data: ndarray of shape (n_samples, n_features)
            - data on which FA needs to be performed
        markX: float
            - x-coordinate of the elbow point
        markY: float
            - y-coordinate of the elbow point
        annotX: float
            - x-coordinate where annotation text needs to be placed
        annotY: float
            - y-coordinate where annotation text needs to be placed
        figPath: string
            - Path where figure needs to be stored. If None, the figure is
              plotted in console itself.
    Returns: None
    """
    distThr = []
    nClusters = []
    for i in np.arange(0, 1, 0.05):
        distThr.append(i)
        agglo = FeatureAgglomeration(n_clusters=None, distance_threshold=i)
        clustering = agglo.fit_transform(data)
        nClusters.append(clustering.shape[1])
    plt.rc('font', size=15)  # controls default text sizes
    plt.rc('axes', titlesize=17)  # fontsize of the axes title
    plt.rc('axes', labelsize=17)  # fontsize of the x and y labels
    plt.rc('xtick', labelsize=15)  # fontsize of the tick labels
    plt.rc('ytick', labelsize=15)  # fontsize of the tick labels
    plt.rc('legend', fontsize=15)  # legend fontsize
    plt.rc('figure', titlesize=17)  # fontsize of the figure title
    plt.figure(figsize=(10, 6))
    plt.plot(distThr, nClusters)
    if markX is not None and markY is not None:
        plt.hlines(markY, 0, markX, linestyles='dashed')
        plt.vlines(markX, 0, markY, linestyles='dashed')
        plt.scatter([markX], [markY], c='r')
        '''
        plt.annotate('Elbow point\n'+str((np.around(markX,3), 7)), xy=(markX, markY+0.2),
                     xytext=(0.65, 15), arrowprops=dict(arrowstyle="->",
                     connectionstyle="angle3,angleA=0,angleB=-90"));
        '''
        if annotX is not None and annotY is not None:
            if annotY > markY:
                plt.annotate('Elbow point\n' + str(
                    (np.around(markX, 3), markY)),
                             xy=(markX, markY + 0.2),
                             xytext=(annotX, annotY),
                             arrowprops=dict(
                                 arrowstyle="->",
                                 connectionstyle="angle3,angleA=0,angleB=-90"))
            else:
                plt.annotate('Elbow point\n' + str(
                    (np.around(markX, 3), markY)),
                             xy=(markX, markY - 0.2),
                             xytext=(annotX, annotY),
                             arrowprops=dict(
                                 arrowstyle="->",
                                 connectionstyle="angle3,angleA=0,angleB=-90"))
    plt.xlim(0, 1)
    plt.ylim(0, 27)
    plt.xlabel("Distance Threshold")
    plt.ylabel("Number of Clusters - Reduced Dimensions")
    if figPath is not None:
        plt.savefig(figPath, bbox_inches='tight', pad_inches=0.05)
        plt.close()
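An illustrative call of elbowHeuristic_FA, assuming numpy, matplotlib.pyplot as plt and FeatureAgglomeration are imported as in the original module; the random data, the marked elbow coordinates and the output path are placeholders, not results from the original study.
import numpy as np

X = np.random.rand(200, 24)                      # placeholder data
elbowHeuristic_FA(X, markX=0.45, markY=7,
                  annotX=0.65, annotY=15,
                  figPath='fa_elbow_demo.png')   # hypothetical output file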
Example #14
X_test_pen = pen_scaler.transform(X_test_pen)

sat_scaler.fit(X_test_sat) 
X_test_sat = sat_scaler.transform(X_test_sat)

y_train_pen = [float(num) for num in y_train_pen]
sat_params = [2,10,18]
pen_params = [2,8,14]

from sklearn.cluster import FeatureAgglomeration
for i in range(3):
    sat_fa = FeatureAgglomeration(n_clusters=sat_params[i])
    pen_fa = FeatureAgglomeration(n_clusters=pen_params[i])


    X_train_sat = sat_fa.fit_transform(X_train_sat_og)   
    X_train_pen = pen_fa.fit_transform(X_train_pen_og)

    plt.figure(figsize=(12, 6))  
    plt.scatter(X_train_sat[:,0], X_train_sat[:,1], c=y_train_sat)
    plt.title('Plot of Top Two Features - Feature Agglomeration of '+str(sat_params[i])+' Features (Satellite)')  
    plt.xlabel('Feature 1')  
    plt.ylabel('Feature 2')
    plt.show()
    plt.figure(figsize=(12, 6))
    plt.scatter(X_train_pen[:,0], X_train_pen[:,1], c=y_train_pen)
    plt.title('Plot of Top Two Features - Feature Agglomeration of '+str(pen_params[i])+' Features (Digits)')  
    plt.xlabel('Feature 1')  
    plt.ylabel('Feature 2')
    plt.show()
Example #15
            do_tsne = True
            if [do_agglomeration, do_pca, do_tsne].count(True) != 1:
                logger.warn(
                    'Can do exactly one of FeatureAgglomeration, PCA, t-SNE.')
                quit()

            # todo add a test to make sure we do one of these
            if False:
                pass
            elif do_agglomeration:
                model = FeatureAgglomeration()
                if False:
                    pass
                elif do_lda:
                    try:
                        scatter_points = model.fit_transform(lda_results)
                    except AssertionError as assertionError:
                        logger.warn('%s : %s', input_file, assertionError)
                elif do_lsa:
                    scatter_points = model.fit_transform(lsa_results)
                elif do_nmf:
                    scatter_points = model.fit_transform(nmf_results)
            elif do_pca:
                pass
                model = PCA(n_components=2, random_state=random_state)
                if False:
                    pass
                elif do_lda:
                    try:
                        scatter_points = model.fit_transform(lda_results)
                    except AssertionError as assertionError:
Example #16
grp_train = grp.fit_transform(x_train)
grp_test = grp.transform(x_test)

# SRP
srp = SparseRandomProjection(n_components=remaining_comp, dense_output=True, random_state=420)
srp_train = srp.fit_transform(x_train)
srp_test = srp.transform(x_test)

# NMF
nmf = NMF(n_components=remaining_comp, init='nndsvdar', random_state=420)
nmf_train = nmf.fit_transform(x_train)
nmf_test = nmf.transform(x_test)

# FAG (FeatureAgglomeration)
fag = FeatureAgglomeration(n_clusters=remaining_comp, linkage='ward')
fag_train = fag.fit_transform(x_train)
fag_test = fag.transform(x_test)

# for i in range(1, remaining_comp + 1):
#     x_train['pca_' + str(i)] = pca_train[:, i - 1]
#     x_test['pca_' + str(i)] = pca_test[:, i - 1]
#
#     x_train['ica_' + str(i)] = ica_train[:, i - 1]
#     x_test['ica_' + str(i)] = ica_test[:, i - 1]
#
#     x_train['tsvd_' + str(i)] = tsvd_train[:, i - 1]
#     x_test['tsvd_' + str(i)] = tsvd_test[:, i - 1]
#
#     x_train['grp_' + str(i)] = grp_train[:, i - 1]
#     x_test['grp_' + str(i)] = grp_test[:, i - 1]
#
Example #17
    data.append(str)
    labels.append(arr[0])

vectorizer = TfidfVectorizer(max_df=0.15,
                             min_df=1,
                             stop_words='english',
                             use_idf=True,
                             smooth_idf=True,
                             norm='l2')
X = vectorizer.fit_transform(data)

ftrCluster = FeatureAgglomeration(n_clusters=20,
                                  affinity='euclidean',
                                  compute_full_tree='auto',
                                  linkage='ward')
fittans = ftrCluster.fit_transform(X.toarray())

np.savetxt(
    '/users/grad/rakib/dr.norbert/dataset/shorttext/biomedical/semisupervised/hacftrtfidf_20',
    fittans,
    delimiter=' ',
    fmt='%.10f')

############

from sklearn.cluster import KMeans
import numpy as np
import sys
import math
from sklearn import svm
import collections
Example #18
    ica1 = FastICA(n_components=20)
    data1_X_ica = ica1.fit_transform(data1_X_train)
    data1_X_ica_test = ica1.transform(data1_X_test)
    ica2 = FastICA(n_components=90)
    data2_X_ica = ica2.fit_transform(data2_X_train)
    data2_X_ica_test = ica2.transform(data2_X_test)

    grp1 = GaussianRandomProjection(n_components=20)
    data1_X_grp = grp1.fit_transform(data1_X_train)
    data1_X_grp_test = grp1.transform(data1_X_test)
    grp2 = GaussianRandomProjection(n_components=90)
    data2_X_grp = grp2.fit_transform(data2_X_train)
    data2_X_grp_test = grp2.transform(data2_X_test)

    fa1 = FeatureAgglomeration(n_clusters=20)
    data1_X_fa = fa1.fit_transform(data1_X_train)
    data1_X_fa_test = fa1.transform(data1_X_test)
    fa2 = FeatureAgglomeration(n_clusters=90)
    data2_X_fa = fa2.fit_transform(data2_X_train)
    data2_X_fa_test = fa2.transform(data2_X_test)
    ''' clustering '''

    clusters = np.logspace(0.5,
                           2,
                           num=10,
                           endpoint=True,
                           base=10.0,
                           dtype=None)
    for i in range(0, len(clusters)):
        clusters[i] = int(clusters[i])
    print clusters
Example #19
# dfs = df[(df[:, 0] == 'A') | (df[:, 0] == 'B') | (df[:, 0] == 'C') | (df[:, 0] == 'D') |
#          (df[:, 0] == 'E') | (df[:, 0] == 'F') | (df[:, 0] == 'G') | (df[:, 0] == 'H') |
#          (df[:, 0] == 'I') | (df[:, 0] == 'J') | (df[:, 0] == 'K') | (df[:, 0] == 'L') |
#          (df[:, 0] == 'M') | (df[:, 0] == 'N') | (df[:, 0] == 'O')]
data = df[:, 1:]
data = scale(data)
labels = df[:, 0]
n_digits = len(np.unique(labels))

print(data.shape)
print(data.std())
std_transf = np.zeros(15)
x_axis = range(2, 17)
for i in x_axis:
    transformer = FeatureAgglomeration(n_clusters=i)
    rp = transformer.fit_transform(data)
    print(rp.shape)
    std_transf[i - 2] = rp.std()
plt.plot(x_axis, std_transf)
plt.ylabel('STD of data')
plt.xlabel('n_components')
plt.title("STD Vs Dimensions For Feature Agglomeration")
plt.show()

# ###############################################################################
# # Visualize the results on PCA-reduced data
#
# reduced_data = PCA(n_components=2).fit_transform(data)
# kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
# kmeans.fit(reduced_data)
#
Example #20
# SRP
srp = SparseRandomProjection(n_components=n_comp,
                             dense_output=True,
                             random_state=420)
srp_results_train = srp.fit_transform(train)
srp_results_test = srp.transform(test)

# NMF
nmf = NMF(n_components=n_comp, init='nndsvdar', random_state=420)
nmf_results_train = nmf.fit_transform(train)
nmf_results_test = nmf.transform(test)

# FAG (FeatureAgglomeration)
fag = FeatureAgglomeration(n_clusters=n_comp, linkage='ward')
fag_results_train = fag.fit_transform(train)
fag_results_test = fag.transform(test)

# ### Filtering the most significant components and inserting in a Dataframe ###

# In[ ]:

dim_reds = list()
train_pca = pd.DataFrame()
test_pca = pd.DataFrame()

train_ica = pd.DataFrame()
test_ica = pd.DataFrame()

train_tsvd = pd.DataFrame()
test_tsvd = pd.DataFrame()
Example #21
for component in range(1, len(X_train[0])+1):
    grp = GaussianRandomProjection(n_components=component, random_state=1)
    X_train_reduced = grp.fit_transform(X_train)
    X_test_reduced = grp.transform(X_test)

    knn = KNeighborsClassifier(n_neighbors=3)  
    knn.fit(X_train_reduced, y_train)
    train_scores.append(knn.score(X_train_reduced, y_train))
    test_scores.append(knn.score(X_test_reduced, y_test))
if dataset_name=='spam':
    drawMultiple([train_scores, test_scores], 'KNN Accuracy over Randomized Projected Components (Spam dataset)', 'Number of Projected Components', 'Accuracy', ['projected train score','projected test score'], range(1, len(X_train[0])+1))
elif dataset_name=='letter':
    drawMultiple([train_scores, test_scores], 'KNN Accuracy over Randomized Projected Components (Letter Recognition dataset)', 'Number of Projected Components', 'Accuracy', ['projected train score','projected test score'], range(1, len(X_train[0])+1))

#FA
train_scores=[]
test_scores=[]
for component in range(1, len(X_train[0])+1):
    fa = FeatureAgglomeration(n_clusters=component)
    X_train_reduced = fa.fit_transform(X_train)
    X_test_reduced = fa.transform(X_test)

    knn = KNeighborsClassifier(n_neighbors=3)  
    knn.fit(X_train_reduced, y_train)
    train_scores.append(knn.score(X_train_reduced, y_train))
    test_scores.append(knn.score(X_test_reduced, y_test))
if dataset_name=='spam':
    drawMultiple([train_scores, test_scores], 'KNN Accuracy over Feature Agglomeration Components (Spam dataset)', 'Number of Agglomerated Components', 'Accuracy', ['Agglomerated train score','Agglomerated test score'], range(1, len(X_train[0])+1))
elif dataset_name=='letter':
    drawMultiple([train_scores, test_scores], 'KNN Accuracy over Feature Agglomeration Components (Letter Recognition dataset)', 'Number of Agglomerated Components', 'Accuracy', ['Agglomerated train score','Agglomerated test score'], range(1, len(X_train[0])+1))
Example #22
File: ROC-curve.py  Project: jche/GumGum
from sklearn.cross_validation import KFold
from sklearn import metrics

###############################################################################
# Data IO and generation
# import some data to play with
#file = "/home/kbhalla/Desktop/Data/day_samp_new.npy"
file = "/home/rmendoza/Documents/Data/day_samp_new_0604.npy"
with open(file, "r") as file_in:
        matrix = smio.load_sparse_csr(file_in)

X = matrix[:,:-1]
FA = FeatureAgglomeration(n_clusters=250)
print np.shape(X)
y = matrix[:,-1]
X = FA.fit_transform(X,y)
n_samples, n_features = X.shape
k = int(0.8*n_samples)
#random_state = np.random.RandomState(0)
#X = np.c_[X, random_state.randn(n_samples, 2*n_features)]
X_test, y_test = X[k:,:], y[k:]
X, y = X[:k, :], y[:k]
sm = SMOTE(ratio=0.95)
X,y = sm.fit_sample(X, y)
print np.shape(X)
start = time.time()




###############################################################################
Example #23
lecs=data.iloc[:,:-6]
from sklearn.cluster import FeatureAgglomeration
agglo = FeatureAgglomeration(n_clusters=15)


# In[ ]:


agglo.fit(lecs)


# In[ ]:


lecs_reduced=agglo.fit_transform(lecs)


# In[ ]:


lecs_reduced.shape


# In[ ]:


from sklearn.decomposition import PCA, KernelPCA


# In[ ]:
Example #24
plt.ylabel('Y')
plt.title('AgglomerativeClustering', fontdict=dict(size=20, color='r'))

model4 = Birch(threshold=0.5, branching_factor=50, n_clusters=4)
model4.fit(x)
print('\nBirch:')
print(model4.subcluster_centers_.shape)
ypred4 = model4.predict(x)
plt.figure(figsize=(12, 8))
plt.scatter(x[:,0], x[:,1], c=ypred4, cmap='Spectral')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Hierarchical_Birch', fontdict=dict(size=20, color='r'))

model5 = FeatureAgglomeration(n_clusters=2, affinity='euclidean', linkage='complete')  # dimensionality reduction
x_new = model5.fit_transform(x1)
model6 = KMeans(n_clusters=4, max_iter=300, tol=0.0001, verbose=0, random_state=1, n_jobs=4)
ypred6 = model6.fit_predict(x_new)
plt.figure(figsize=(12, 8))
plt.scatter(x_new[:, 0], x_new[:, 1], c=ypred6, cmap='coolwarm')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('FeatureAgglomeration_KMeans', fontdict=dict(size=20, color='r'))

model7 = DBSCAN(eps=0.8, min_samples=5, metric='euclidean', leaf_size=30, n_jobs=4)
ypred7 = model7.fit_predict(x)
plt.figure(figsize=(12, 8))
plt.scatter(x[:, 0], x[:, 1], c=ypred7, cmap='seismic')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('DBSCAN', fontdict=dict(size=20, color='r'))
Example #25
def ft_red_select(x,
                  y,
                  choice,
                  no_normalize,
                  dis_kept_features,
                  num_features=30):
    """
    Reduces the dimensionality of the feature matrix 'x' using the method named by 'choice'
    ("pca", "grp", "agglom", "thresh" or "rf"), keeping roughly 'num_features' features/components.
    :return: the new reduced 'x' and the unchanged 'y', to be later written to a new file
    """

    #Normalize the data
    if not no_normalize:
        x = normalize(x)

    #Given the argument choice of feature selection/reduction, creates the relevant object, fits the 'x' data to it,
    #and reduces/transforms it to a lower dimensionality
    new_x = []
    print("Original 'x' shape:", np.shape(x))
    if choice == "pca":
        pca = PCA(n_components=num_features)
        new_x = pca.fit_transform(x)
        print("Explained variance = " +
              str(round(sum(pca.explained_variance_ratio_) * 100, 2)) + "%")
    elif choice == "grp":
        grp = GaussianRandomProjection(n_components=num_features)
        new_x = grp.fit_transform(x)
    elif choice == "agglom":
        agg = FeatureAgglomeration(n_clusters=num_features)
        new_x = agg.fit_transform(x)
    elif choice == "thresh":
        #Below threshold gives ~26 components upon application
        vt = VarianceThreshold(threshold=0.00015)
        new_x = vt.fit_transform(x)
        print("Explained variance = " +
              str(round(sum(vt.variances_) * 100, 2)) + "%")
        kept_features = list(vt.get_support(indices=True))
        if dis_kept_features:
            print("Kept features: ")
            for i in kept_features:
                print(col_names[i])
    elif choice == "rf":
        y_labels = [1 if s == "D" else 0 for s in y[:, 1]]
        clf = RandomForestClassifier(n_estimators=10000,
                                     random_state=0,
                                     n_jobs=-1)
        print("Fitting RF model....")
        clf.fit(x, y_labels)
        sfm = SelectFromModel(clf,
                              threshold=-np.inf,
                              max_features=num_features)
        print("Selecting best features from model...")
        sfm.fit(x, y_labels)
        kept_features = list(sfm.get_support(indices=True))
        if dis_kept_features:
            print("Kept features: ")
            for i in kept_features:
                print(col_names[i])
        new_x = x[:, kept_features]

    print("Reduced 'x' shape:", np.shape(new_x))
    return new_x, y
Example #26
    error_rate_test_1 = np.zeros(np.shape(data1_X_train)[1])

    DT1 = tree.DecisionTreeClassifier(criterion='gini', min_samples_leaf=5, max_depth=None)

    error_rate_train_DT_1 = sum(
        DT1.fit(data1_X_train, data1_y_train).predict(data1_X_train) == data1_y_train) * 1.0 / data1_y_train.shape[0]
    print "error_rate_train_DT_1", error_rate_train_DT_1
    error_rate_test_DT_1 = sum(
        DT1.fit(data1_X_train, data1_y_train).predict(data1_X_test) == data1_y_test) * 1.0 / data1_y_test.shape[0]
    print "error_rate_test_DT_1", error_rate_test_DT_1

    for i in range(0, np.shape(data1_X_train)[1]):
        print i
        start_time = time.time()
        fa.set_params(n_clusters=i + 1)
        data1_X_train_fa = fa.fit_transform(data1_X_train)
        data1_X_test_fa = fa.transform(data1_X_test)

        error_rate_train_1[i] = sum(
            DT1.fit(data1_X_train_fa, data1_y_train).predict(data1_X_train_fa) == data1_y_train) * 1.0 / \
                                data1_y_train.shape[0]
        print("error_rate_train_1[%f]" % i), error_rate_train_1[i]
        error_rate_test_1[i] = sum(
            DT1.fit(data1_X_train_fa, data1_y_train).predict(data1_X_test_fa) == data1_y_test) * 1.0 / \
                               data1_y_test.shape[0]
        print("error_rate_test_1[%f]" % i), error_rate_test_1[i]
        print "time consumed:", time.time() - start_time

    file_2.write("FA_error_rate_train_1")
    for i in range(0, len(error_rate_train_1)):
        file_2.write(";")
Example #27
def FA_reduced(X_train):
    fa = FeatureAgglomeration(n_clusters=10)
    X_train_reduced = fa.fit_transform(X_train)
    return X_train_reduced
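A trivial, hypothetical call of FA_reduced, assuming numpy and the FeatureAgglomeration import used above are in scope; the random matrix stands in for real training data.
import numpy as np

X_train = np.random.rand(60, 40)       # placeholder training data
X_train_small = FA_reduced(X_train)
print(X_train_small.shape)             # (60, 10): 40 features pooled into 10 clusters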
Example #28
def train_and_test(alpha,
                   predictors,
                   predictor_params,
                   x_filename,
                   y_filename,
                   n_users,
                   percTest,
                   featureset_to_use,
                   diff_weighting,
                   phi,
                   force_balanced_classes,
                   do_scaling,
                   optimise_predictors,
                   report,
                   conf_report=None):
    # all_X = numpy.loadtxt(x_filename, delimiter=",")
    all_X = numpy.load(x_filename + ".npy")
    all_y = numpy.loadtxt(y_filename, delimiter=",")

    print("loaded X and y files", x_filename, y_filename)

    if numpy.isnan(all_X).any():
        print("nan in", x_filename)
        exit()

    if numpy.isnan(all_y).any():
        print("nan in", y_filename)
        exit()

    #print("selecting balanced subsample")
    print("t t split")
    X_train, X_test, y_train, y_test = train_test_split(all_X,
                                                        all_y,
                                                        test_size=percTest,
                                                        random_state=666)

    # feature extraction
    # test = SelectKBest(score_func=chi2, k=100)
    # kb = test.fit(X_train, y_train)
    # # summarize scores
    # numpy.set_printoptions(precision=3)
    # print(kb.scores_)
    # features = kb.transform(X_train)
    # mask = kb.get_support()
    # # summarize selected features
    # print(features.shape)
    # X_train = X_train[:,mask]
    # X_test = X_test[:,mask]

    scaler = StandardScaler()
    rdim = FeatureAgglomeration(n_clusters=100)
    if do_scaling:
        # input(X_train.shape)
        X_train = rdim.fit_transform(X_train)
        X_test = rdim.transform(X_test)
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        with open('../../../isaac_data_files/qutor_scaler.pkl',
                  'wb') as output:
            pickle.dump(scaler, output, pickle.HIGHEST_PROTOCOL)
        with open('../../../isaac_data_files/qutor_rdim.pkl', 'wb') as output:
            pickle.dump(rdim, output, pickle.HIGHEST_PROTOCOL)

    # print("feature reduction...")
    # pc = PCA(n_components=100)
    # X_train = pc.fit_transform(X_train)
    # X_test = pc.transform(X_test)

    classes = numpy.unique(y_train)
    sample_weights = None
    if (force_balanced_classes):
        X_train, y_train = balanced_subsample(X_train, y_train, 1.0)  #0.118)

    print("X_train shape:", X_train.shape)
    print("X_test shape:", X_test.shape)

    print("tuning classifier ...")
    for ix, p in enumerate(predictors):
        print(type(p))
        print(p.get_params().keys())

        if optimise_predictors == True and len(predictor_params[ix]) > 1:
            pbest = run_random_search(p, X_train, y_train,
                                      predictor_params[ix])
        else:
            pbest = p.fit(X_train, y_train)
        predictors[ix] = pbest

    print("pickling classifier ...")
    for ix, p in enumerate(predictors):
        p_name = predictor_params[ix]['name']
        with open(
                '../../../isaac_data_files/p_{}_{}_{}.pkl'.format(
                    p_name, alpha, phi), 'wb') as output:
            pickle.dump(p, output, pickle.HIGHEST_PROTOCOL)
    print("done!")

    # report.write("* ** *** |\| \` | |  |) /; `|` / |_| *** ** *\n")
    # report.write("* ** *** | | /_ |^|  |) ||  |  \ | | *** ** *\n")
    #report.write("RUNS,P,FB,WGT,ALPHA,PHI,SCL,0p,0r,0F,0supp,1p,1r,1F,1supp,avg_p,avg_r,avg_F,#samples\n")
    for ix, p in enumerate(predictors):

        report.write(",".join(
            map(str, (all_X.shape[0], str(p).replace(",", ";").replace(
                "\n", ""), force_balanced_classes, diff_weighting, alpha, phi,
                      do_scaling))))

        y_pred_tr = p.predict(X_train)
        y_pred = p.predict(X_test)

        # for x,y,yp in zip(X_train, y_test, y_pred):

        if conf_report:
            conf_report.write(
                str(p).replace(",", ";").replace("\n", "") + "\n")
            conf_report.write(str(alpha) + "," + str(phi) + "\n")
            conf_report.write(str(confusion_matrix(y_test, y_pred)) + "\n")
            conf_report.write("\n")
        # p = precision_score(y_test, y_pred, average=None, labels=classes)
        # r = recall_score(y_test, y_pred, average=None, labels=classes)
        # F = f1_score(y_test, y_pred, average=None, labels=classes)
        p, r, F, s = precision_recall_fscore_support(y_test,
                                                     y_pred,
                                                     labels=classes,
                                                     average=None,
                                                     warn_for=('precision',
                                                               'recall',
                                                               'f-score'))
        avp, avr, avF, _ = precision_recall_fscore_support(
            y_test,
            y_pred,
            labels=classes,
            average='weighted',
            warn_for=('precision', 'recall', 'f-score'))
        for ix, c in enumerate(classes):
            report.write(",{},{},{},{},{},".format(c, p[ix], r[ix], F[ix],
                                                   s[ix]))
        report.write("{},{},{},{}\n".format(avp, avr, avF, numpy.sum(s)))

        # report.write(classification_report(y_test, y_pred)+"\n")
        # report.write("------END OF CLASSIFIER------\n")
        report.flush()
    return X_train, X_test, y_pred_tr, y_pred, y_test, scaler
Example #29
def validate_spectral_clusters(clusterCenters,
                               labels,
                               originalData,
                               nEigenVectors,
                               partitions=4,
                               dimRedMethod=None,
                               trials=100):
    """
    Validates the given spectral clustering result by re-clustering shuffled partitions
    of the original data and counting how often each sample is still assigned to its
    original cluster.
    Params:
        clusterCenters: ndarray of shape (n_clusters, n_features)
            - cluster centers as assigned by the algorithm which needs to be
              validated
        labels: ndarray of shape (n_samples,)
            - labels assigned by the clustering algorithm to each household
        originalData: ndarray (shape determined by the problem)
            - original data which was used for pre-processing followed by
              dimensionality reduction before passing onto for final clustering
            - this will be passed straight to the methods:
              - preProcessing_clustering:: to get ndarray of shape (n_samples, n_features)
              - shuffle_partition:: to get list of arrays similar to originalData
        nEigenVectors: int
            - number of eigenvectors used during spectral clustering
        partitions: int (>1)
            - number of partitions (of the original data) to be studied
        dimRedMethod: 'FA' or 'PCA' or None
            - Dimensionality reduction method which was used post pre-processing
        trials: int (>=1)
            - Number of times partitioning is done before averaging out the results
    Returns:
        totalCases: int
            - total number of cases for which match/mis-match is calculated
        nMatchAvg: float
            - average number of matches across trials
        nMisMatchAvg: float
            - average number of mis-matches across trials
        percentMatch: float
            - match% obtained across trials
        percentMisMatch: float
            - mis-match% obtained across trials
        sampleMisMatchFreq: ndarray of shape (n_samples,)
            - Average number of mis-matches obtained for each sample after all
              trials
            - Note: In each trial, number of times a match/mis-match is calculated
              for a particular sample is equal to the number of partitions studied
              during the validation
    """
    nClusters = clusterCenters.shape[0]
    nComponents = clusterCenters.shape[1]
    sampleMisMatchFreq = np.zeros((len(labels)))
    nMatchAvg = 0
    nMisMatchAvg = 0
    for trial in range(trials):
        # Shuffle and Partition the data
        partitionedData = shuffle_partition(originalData, partitions)
        nMatch = 0
        nMisMatch = 0
        for i in range(len(partitionedData)):
            # Perform pre-processing routines on each partitions
            # This variables shape must be (n_samples, n_features)
            processedData = preProcessing_clustering(partitionedData[i])
            # Perform dimensionality reduction on pre-processed partitions
            if dimRedMethod == None:
                processedData_reduced = processedData
            elif dimRedMethod == 'PCA':
                if nComponents == None:
                    raise ValueError(
                        "nComponents cannot be None when dimRedMethod is not None."
                    )
                pca = PCA(n_components=nComponents)
                processedData_reduced = pca.fit_transform(processedData)
            elif dimRedMethod == 'FA':
                if nComponents == None:
                    raise ValueError(
                        "nComponents cannot be None when dimRedMethod is not None."
                    )
                agglo = FeatureAgglomeration(n_clusters=nComponents)
                processedData_reduced = agglo.fit_transform(processedData)
            else:
                raise ValueError(
                    "dimRedMethod should either be 'PCA' or 'FA' or None - found something else."
                )
            # Perform spectral clustering's internal processing steps before
            # internal implementation of K-Means algorithm
            _, U, _ = spectralClustering_KM_KNN_Euc(
                processedData_reduced, nClusters, nEigenVectors=nEigenVectors)
            # Check which cluster is nearest to the newly obtained vector
            # representations of the same sample. Note: corresponding to each
            # sample, each partition specifies a new representation of the sample.
            # In other words, original sample is divided into n (n = number of
            # partitions) representations of itself.
            for sample in range(len(processedData_reduced)):
                temp = np.argmin(
                    np.linalg.norm(clusterCenters - U[sample].reshape(1, -1),
                                   axis=1))
                if temp == labels[sample]:
                    nMatch += 1
                else:
                    nMisMatch += 1
                    sampleMisMatchFreq[sample] += 1
        nMatchAvg = ((nMatchAvg * trial) + nMatch) / (trial + 1)
        nMisMatchAvg = ((nMisMatchAvg * trial) + nMisMatch) / (trial + 1)
    totalCases = int(np.round(nMatchAvg + nMisMatchAvg))
    percentMatch = (nMatchAvg * 100) / totalCases
    percentMisMatch = (nMisMatchAvg * 100) / totalCases
    return totalCases, nMatchAvg, nMisMatchAvg, percentMatch, percentMisMatch, sampleMisMatchFreq