def otherScikitImpl(data, orig_dimension, new_dimension):
    # Draw a Gaussian random matrix via scikit-learn's private
    # _make_random_matrix helper and apply the projection manually.
    rp = GaussianRandomProjection(n_components=new_dimension)
    m = rp._make_random_matrix(new_dimension, orig_dimension)
    reduced = np.asarray(m) @ np.asarray(data).T  # np.mat is deprecated
    return reduced.T
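The snippet above reaches into scikit-learn's private _make_random_matrix helper. A minimal sketch of the same projection through the public API (other_scikit_impl_public is a hypothetical name; fit_transform draws the Gaussian matrix internally):

import numpy as np
from sklearn.random_projection import GaussianRandomProjection

def other_scikit_impl_public(data, new_dimension, seed=None):
    # let scikit-learn generate and apply the random matrix
    rp = GaussianRandomProjection(n_components=new_dimension, random_state=seed)
    return rp.fit_transform(np.asarray(data))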
def run_RCA(X,y,title):
    
    dims = list(np.arange(2,(X.shape[1]-1),3))
    dims.append(X.shape[1])
    tmp = defaultdict(dict)

    for i,dim in product(range(5),dims):
        print('round', i)
        rp = GRP(random_state=i, n_components=dim)
        tmp[dim][i] = pairwiseDistCorr(rp.fit_transform(X), X)
    tmp = pd.DataFrame(tmp).T
    mean_recon = tmp.mean(axis=1).tolist()
    std_recon = tmp.std(axis=1).tolist()


    fig, ax1 = plt.subplots()
    ax1.plot(dims,mean_recon, 'b-')
    ax1.set_xlabel('Random Components')
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('Mean Reconstruction Correlation', color='b')
    ax1.tick_params('y', colors='b')
    plt.grid(False)

    ax2 = ax1.twinx()
    ax2.plot(dims,std_recon, 'm-')
    ax2.set_ylabel('STD Reconstruction Correlation', color='m')
    ax2.tick_params('y', colors='m')
    plt.grid(False)

    plt.title("Random Components for 5 Restarts: "+ title)
    fig.tight_layout()
    plt.show()
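run_RCA uses GRP (an alias for GaussianRandomProjection) and a pairwiseDistCorr helper that is not shown in this listing. A minimal sketch consistent with how it is called here, assuming it correlates the pairwise-distance matrices of the projected and original data:

import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

def pairwiseDistCorr(X1, X2):
    assert X1.shape[0] == X2.shape[0]
    # correlation between the flattened pairwise-distance matrices
    d1 = pairwise_distances(X1)
    d2 = pairwise_distances(X2)
    return np.corrcoef(d1.ravel(), d2.ravel())[0, 1]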
Example #3
    def rp_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
        scl = RobustScaler()
        X_train_scl = scl.fit_transform(X_train)

        ks = []
        for i in range(1000):
            ##
            ## Random Projection
            ##
            rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
            rp.fit(X_train_scl)
            X_train_rp = rp.transform(X_train_scl)

            ks.append(kurtosis(X_train_rp))

        mean_k = np.mean(ks, 0)

        ##
        ## Plots
        ##
        ph = plot_helper()

        title = 'Kurtosis (Randomized Projection) for ' + data_set_name
        name = data_set_name.lower() + '_rp_kurt'
        filename = './' + self.out_dir + '/' + name + '.png'

        ph.plot_simple_bar(np.arange(1,
                                     len(mean_k) + 1, 1), mean_k,
                           np.arange(1,
                                     len(mean_k) + 1, 1).astype('str'),
                           'Feature Index', 'Kurtosis', title, filename)
def plot_data(method, X, y, title, filename):

    fig, (ax1) = plt.subplots(1, 1)

    n_labels = len(y)

    if method == 'pca':
        t = decomposition.PCA(n_components=2)
        X = t.fit_transform(X)
    elif method == 'ica':
        t = decomposition.FastICA(n_components=2, whiten=True)
        X = t.fit_transform(X)
    elif method == 'rp':
        t = GaussianRandomProjection(n_components=2)
        X = t.fit_transform(X)

    np.random.seed(20)
    for label in np.unique(y):
        ax1.scatter(X[y == label, 0],
                    X[y == label, 1],
                    color=np.random.rand(3),
                    linewidths=1)

    ax1.set_title(title)
    ax1.grid()
    plt.tight_layout()

    plt.savefig('/'.join(['output', filename]))
    plt.close("all")
Example #5
def test_output_transformer():
    X, y = datasets.make_multilabel_classification(return_indicator=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    # Check that the per-estimator random_state values are all different
    transformer = GaussianRandomProjection(n_components=5, random_state=None)
    for name, ForestEstimator in FOREST_ESTIMATORS.items():
        est = ForestEstimator(random_state=5, output_transformer=transformer)
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        assert_equal(y_pred.shape, y_test.shape)

        random_state = [
            sub.output_transformer_.random_state for sub in est.estimators_
        ]

        assert_equal(len(set(random_state)), est.n_estimators)

    # Check that the random_state values are all equal
    transformer = FixedStateTransformer(
        GaussianRandomProjection(n_components=5), random_seed=0)
    for name, ForestEstimator in FOREST_ESTIMATORS.items():
        est = ForestEstimator(random_state=5, output_transformer=transformer)
        est.fit(X_train, y_train)
        y_pred = est.predict(X_test)
        assert_equal(y_pred.shape, y_test.shape)

        random_state = [
            sub.output_transformer_.random_state for sub in est.estimators_
        ]

        assert_equal(len(set(random_state)), 1)
        assert_equal(random_state[0], 0)
Example #6
def gaussian_random_projection(A, k):
    """
    Gaussian random projection from sklearn.
    """
    transformer = GaussianRandomProjection(n_components=k)
    A_embedded = transformer.fit_transform(A)
    return A_embedded
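A quick usage example on synthetic data (the shapes are illustrative only):

import numpy as np

A = np.random.RandomState(0).rand(100, 50)   # 100 points in 50 dimensions
A_embedded = gaussian_random_projection(A, 10)
print(A_embedded.shape)                      # (100, 10)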
Example #7
def red_dim(X_tr, y_tr, *X_tests, meth, classif=True, nFeats=784, post_norm=False):
    X_tests_ = []
    if meth == 'UFS':
        # 1. UFS
        score_func = f_classif if classif else f_regression
        ufs = SelectKBest(score_func=score_func, k=nFeats)
        X_tr = ufs.fit_transform(X_tr, y_tr)
        for X_te in X_tests:
            X_tests_.append(ufs.transform(X_te))
    elif meth == 'RFE':
        # 2. RFE
        estim = SVC(kernel="linear", C=1) if classif else SVR(kernel="linear")
        rfe = RFE(estim, n_features_to_select=nFeats, step=0.10)
        rfe = rfe.fit(X_tr, y_tr)
        X_tr = X_tr[:, rfe.support_]
        for X_te in X_tests:
            X_tests_.append(X_te[:, rfe.support_])
    elif meth == 'GRP':
        # 3. GRP
        grp = GaussianRandomProjection(n_components=nFeats)
        X_tr = grp.fit_transform(X_tr, y_tr)
        for X_te in X_tests:
            X_tests_.append(grp.transform(X_te))
    else:
        print('Check Dim. Red. Method')
    if post_norm:
        logger.info("Applying post-normalization...")
        ss = StandardScaler().fit(X_tr)
        X_tr = ss.transform(X_tr)
        for i in range(len(X_tests_)):
            X_tests_[i] = ss.transform(X_tests_[i])

    logger.info('{} X_train {} '.format(meth, X_tr.shape) +
                ' '.join(['X_test ({}) {}'.format(i, X_te.shape) for i, X_te in enumerate(X_tests_)]))
    return X_tr, X_tests_
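A sketch of calling red_dim through its GRP branch. The arrays are synthetic placeholders, and red_dim expects a module-level logger, so one is configured here:

import logging
import numpy as np

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # red_dim logs through this name

rng = np.random.RandomState(0)
X_tr, y_tr = rng.rand(200, 784), rng.randint(0, 10, 200)
X_te = rng.rand(50, 784)
X_tr_red, (X_te_red,) = red_dim(X_tr, y_tr, X_te, meth='GRP', nFeats=100)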
Example #8
    def project_points(points, dim=None):
        if dim is None:
            dim = 5
            #dim = min(max(int(np.log(len(points))), 2), 15)

        proj = GaussianRandomProjection(n_components=dim)
        return proj.fit_transform(points)
def components(K):
    Sum_of_squared_distances = []
    k = []
    accuracy = []
    score = []
    for i in range(1, K):
        transformer = GaussianRandomProjection(n_components=i, eps=0.1)
        #transformer1 = GaussianRandomProjection(n_components=i,eps=0.5)
        #transformer2 = GaussianRandomProjection(n_components=i,eps=0.6)
        X_new = transformer.fit_transform(X)
        #label=transformer.predict(X)
        km = KMeans(n_clusters=2, random_state=0, max_iter=10000,
                    tol=1e-9).fit(X_new)
        label = km.predict(X_new)
        accu = matchfn(y, label)
        #score_train1=metrics.silhouette_score(X_new,label, metric='euclidean')
        Sum_of_squared_distances.append(km.inertia_)
        k.append(i)
        accuracy.append(accu)
        #score.append(score_train1)
        #print(Sum_of_squared_distances)
    k = np.array(k)
    Sum_of_squared_distances = np.array(Sum_of_squared_distances)
    score = np.array(score)
    accuracy = np.array(accuracy)
    #line1,=plt.plot(k, Sum_of_squared_distances, 'bx-',marker='o')
    #line2,=plt.plot(k,score,color='g',marker='o')
    line3, = plt.plot(k, accuracy, color='r', marker='o')
    plt.xlabel('k')
    plt.ylabel('accuracy')
    #plt.title('Elbow curve Optimal k')
    #plt.ylim(0,1)
    plt.show()
    return None
def rp(X, y, n_components='auto', eps=0.1, random_state=None, plot=1, dataset='german'):
    rp_model = GaussianRandomProjection(n_components=n_components, eps=eps, random_state=random_state)
    rp_model.fit(X)
    X_new = rp_model.transform(X)
    if plot:
        if dataset == 'german':
            plt.scatter(X_new[y == 1, 0], X_new[y == 1, 1], c='red', label='Samples with label 1')
            plt.scatter(X_new[y == 0, 0], X_new[y == 0, 1], c='green', label='Samples with label 0')
            plt.title("German dataset after Randomized Projection")
            plt.legend()
            plt.xlabel("Component 1")
            plt.ylabel("Component 2")
            plt.savefig("german-after-Random-Projection.png")
            plt.close()

        elif dataset == 'australian':
            plt.scatter(X_new[y == 1, 0], X_new[y == 1, 1], c='red', label='Samples with label 1')
            plt.scatter(X_new[y == 0, 0], X_new[y == 0, 1], c='green', label='Samples with label 0')
            plt.title("Australian dataset after Randomized Projection")
            plt.legend()
            plt.xlabel("Component 1")
            plt.ylabel("Component 2")
            plt.savefig("australian-after-Random-Projection.png")
            plt.close()
    return X_new
def save_new_data(dataset, n_components, iteration):
    X, y = load_dataset(dataset)
    data = X
    rp = GaussianRandomProjection(n_components=n_components)
    rp.fit(data)

    matrix = rp.components_
    new_data = rp.transform(data)

    plot_data('rp',
              new_data,
              y,
              dataset.title() + ': RP',
              filename='-'.join(
                  ['rp', dataset,
                   str(iteration), 'data', 'trans']))

    results = np.array(new_data)
    np.savetxt('data/' + ('-'.join(
        [dataset, str(n_components),
         str(iteration) + 'rp.csv'])),
               results,
               delimiter=",")

    new_data_inv = np.dot(new_data, matrix)
    loss = metrics.mean_squared_error(data, new_data_inv)
    print(loss)
def eps():
    Sum_of_squared_distances = []
    k = []
    score = []
    eps = [0.8, 0.6, 0.4, 0.2, 0.05, 0.01]
    for i in eps:
        transformer = GaussianRandomProjection(n_components=4, eps=i)
        X_new = transformer.fit_transform(X)
        #label=transformer.predict(X)
        km = KMeans(n_clusters=2, random_state=0, max_iter=10000,
                    tol=1e-9).fit(X_new)
        #label=km.predict(X_new)
        #score_train1=metrics.silhouette_score(X_new,label, metric='euclidean')
        Sum_of_squared_distances.append(km.inertia_)
        k.append(i)
        #score.append(score_train1)
        print(Sum_of_squared_distances)
    k = np.array(k)
    Sum_of_squared_distances = np.array(Sum_of_squared_distances)
    score = np.array(score)
    line1, = plt.plot(k, Sum_of_squared_distances, 'bx-', marker='o')
    #line2,=plt.plot(k,score,color='g',marker='o')
    plt.xlabel('k')
    plt.ylabel('Sum_of_squared_distances')
    plt.title('Elbow curve Optimal eps')
    plt.show()
    return None
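The eps parameter is the Johnson-Lindenstrauss distortion tolerance, and scikit-learn can report the minimum number of components each tolerance implies. A small illustration (n_samples=1000 is an assumption, not taken from the data above):

from sklearn.random_projection import johnson_lindenstrauss_min_dim

for e in [0.8, 0.6, 0.4, 0.2, 0.05, 0.01]:
    # minimum n_components to keep pairwise distances within (1 +/- e)
    print(e, johnson_lindenstrauss_min_dim(n_samples=1000, eps=e))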
Example #14
 def __init__(self, nComp):
     self._N_COMP = nComp
     self._pca = PCA(n_components=self._N_COMP, random_state=17)
     self._tsvd = TruncatedSVD(n_components=self._N_COMP, random_state=17)
     self._ica = FastICA(n_components=self._N_COMP, random_state=17)
     self._grp = GaussianRandomProjection(n_components=self._N_COMP, eps=0.1, random_state=17)
     self._srp = SparseRandomProjection(n_components=self._N_COMP, dense_output=True, random_state=17)
Example #15
File: part2.py Project: rbaxter1/CS7641
 def rp_analysis(self, X_train, X_test, y_train, y_test, data_set_name):
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     
     ks = []
     for i in range(1000):
         ##
         ## Random Projection
         ##
         rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
         rp.fit(X_train_scl)
         X_train_rp = rp.transform(X_train_scl)
         
         ks.append(kurtosis(X_train_rp))
         
     mean_k = np.mean(ks, 0)
         
     ##
     ## Plots
     ##
     ph = plot_helper()
     
     title = 'Kurtosis (Randomized Projection) for ' + data_set_name
     name = data_set_name.lower() + '_rp_kurt'
     filename = './' + self.out_dir + '/' + name + '.png'
     
     ph.plot_simple_bar(np.arange(1, len(mean_k)+1, 1),
                        mean_k,
                        np.arange(1, len(mean_k)+1, 1).astype('str'),
                        'Feature Index',
                        'Kurtosis',
                        title,
                        filename)
Example #16
def eval_RP(X_train, dims):
    tmp = defaultdict(dict)
    for d in dims:
        pdc = 0
        rec_err = 0
        tmp[d]['pdc'] = []
        tmp[d]['rec'] = []
        for i in range(30):
            rp = GaussianRandomProjection(random_state=i, n_components=d)
            trans = rp.fit_transform(X_train)
            pdc = pairwiseDistCorr(trans, X_train)
            rec_err = reconstructionError(rp, X_train)
            tmp[d]['pdc'].append(round(pdc, 4))
            tmp[d]['rec'].append(round(rec_err, 4))

    tmp_sum = defaultdict(dict)
    for d in dims:
        print(d, round(mean(tmp[d]['pdc']), 3), round(np.std(tmp[d]['pdc']),
                                                      3),
              round(mean(tmp[d]['rec']), 3))
        tmp_sum[d] = (d, round(mean(tmp[d]['pdc']),
                               3), round(np.std(tmp[d]['pdc']),
                                         3), round(mean(tmp[d]['rec']), 3))
    return tmp, tmp_sum
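eval_RP also depends on a reconstructionError helper that is not shown. A sketch matching its call signature, assuming it un-projects via the pseudo-inverse of the projection matrix and averages the squared error:

import numpy as np
from scipy.linalg import pinv

def reconstructionError(projections, X):
    W = projections.components_          # (n_components, n_features)
    p = pinv(W)                          # (n_features, n_components)
    reconstructed = ((p @ W) @ X.T).T    # back in the original space
    return np.nanmean(np.square(X - reconstructed))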
Example #17
File: part2.py Project: rbaxter1/CS7641
 def best_rp_nba(self):
     dh = data_helper()
     X_train, X_test, y_train, y_test = dh.get_nba_data()
     
     scl = RobustScaler()
     X_train_scl = scl.fit_transform(X_train)
     X_test_scl = scl.transform(X_test)
     
     rp = GaussianRandomProjection(n_components=X_train_scl.shape[1])
     X_train_transformed = rp.fit_transform(X_train_scl, y_train)
     X_test_transformed = rp.transform(X_test_scl)
     
     ## top 2
     kurt = kurtosis(X_train_transformed)
     i = kurt.argsort()[::-1]
     X_train_transformed_sorted = X_train_transformed[:, i]
     X_train_transformed = X_train_transformed_sorted[:,0:2]
     
     kurt = kurtosis(X_test_transformed)
     i = kurt.argsort()[::-1]
     X_test_transformed_sorted = X_test_transformed[:, i]
     X_test_transformed = X_test_transformed_sorted[:,0:2]
     
     # save
     filename = './' + self.save_dir + '/nba_rp_x_train.txt'
     pd.DataFrame(X_train_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_x_test.txt'
     pd.DataFrame(X_test_transformed).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_y_train.txt'
     pd.DataFrame(y_train).to_csv(filename, header=False, index=False)
     
     filename = './' + self.save_dir + '/nba_rp_y_test.txt'
     pd.DataFrame(y_test).to_csv(filename, header=False, index=False)
Example #18
def dRuns(x_data, n):
    k1 = []

    if n == 'PCA' or n == 'SVD':
        for z in range(2, np.shape(x_data)[1] + 1):
            if n == 'PCA':
                pca = PCA(n_components=z)
                newData = pca.fit_transform(x_data)
                varRatio = pca.explained_variance_ratio_
            elif n == 'SVD':
                svd = TruncatedSVD(n_components=z)
                newData = svd.fit_transform(x_data)
                varRatio = svd.explained_variance_ratio_
            if np.sum(varRatio) >= 0.80:
                return z

    if n == 'ICA' or n == 'Random':
        for z in range(2, np.shape(x_data)[1] + 1):
            if n == 'ICA':
                ica = FastICA(n_components=z)
                newData = ica.fit_transform(x_data)
                newData = pd.DataFrame(newData)
                k1.append(np.mean(newData.kurt()))
            else:
                randProjection = GaussianRandomProjection(n_components=z)
                newData = randProjection.fit_transform(x_data)
                newData = pd.DataFrame(newData)
                k1.append(np.mean(newData.kurt()))

        return np.argmax(k1) + 2
Example #19
class Loda:
    """LODA (Lightweight On-line Detector of Anomalies): random Gaussian
    projections followed by per-dimension histogram density estimates."""
    def __init__(self, projections=50, bins=10):
        self.k = projections
        self.bins = bins
        self.rprog, self.histograms = None, None

    @staticmethod
    def get_bin_density(v, histogram):
        hist, bin_edges = histogram
        for i, be in enumerate(bin_edges):
            if v <= be: break
        i = max(i - 1, 0)
        return i, hist[i]

    def fit(self, X):
        if self.k is not None:
            self.rprog = GaussianRandomProjection(n_components=self.k).fit(X)
        XX = self.rprog.transform(X) if self.rprog is not None else X
        self.histograms = [
            np.histogram(XX[:, j], bins=self.bins, density=True)
            for j in range(XX.shape[1])
        ]
        return self

    def transform(self, X):
        XX = self.rprog.transform(X) if self.rprog is not None else X
        anomaly_vect = lambda xx: [
            -np.log(self.get_bin_density(xx_j, histo)[1])
            for (xx_j, histo) in zip(xx, self.histograms)
        ]
        return np.array([anomaly_vect(xx) for xx in XX])
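Example usage of the Loda class on synthetic data; rows with a larger mean negative log-density are scored as more anomalous:

import numpy as np

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 100))
loda = Loda(projections=50, bins=10).fit(X)
scores = loda.transform(X).mean(axis=1)  # one anomaly score per row
print(scores.shape)                      # (500,)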
Example #20
File: main.py Project: Cphrampus/CS7641
def plot_rca_curve(data):
    scaler = StandardScaler()
    scaler.fit(data)
    x_train_scaler = scaler.transform(data)

    # reconstruction error by components
    recon_errs = []
    sizes = range(1, 12)
    stds = []
    for size in sizes:
        losses = []
        for state in [5, 30, 50, 200, 0]:
            rca = GaussianRandomProjection(n_components=size,
                                           random_state=state)
            transformed_data = rca.fit_transform(x_train_scaler)
            inverse_data = np.linalg.pinv(rca.components_.T)
            reconstructed_data = transformed_data.dot(inverse_data)
            loss = ((x_train_scaler - reconstructed_data)**2).mean()
            losses.append(loss)
        recon_errs.append(np.mean(losses))
        stds.append(np.std(losses))

    print(f"rca avg std: {np.mean(stds)}")

    plt.figure()
    plt.title('recon error by Number of Components')
    plt.ylabel('recon error')
    plt.xlabel('Components')
    plt.plot(sizes, recon_errs)
Example #21
def run_k_means_on_random_projections_cardiovascular_data(path):
    data_set = 'cardio'
    x_train, y_train = load_data(path + 'data/' + data_set + '/train/')
    # X, y = load_data(path + 'data/' + data_set + '/train/')

    grp = GaussianRandomProjection(n_components=5)
    grp_x_train = grp.fit_transform(x_train)

    f = open("cardiovascular_random_projections_stats.txt", "w+")

    # benchmark k-means for k = 1..15 on the projected data
    for k in range(1, 16):
        bench_k_means(str(k), grp_x_train, y_train, k, f, 1)
    f.close()
def PerformRandomProjections(X,Y,num_components,random_state):
    """
    For each num_components, random_state number of times
    random projection is done and that projection is kept
    that gives minimum reconstruction error
    """
    result = {}
    recons_errs = []
    for n in num_components:
        prefix = "rp_" + str(n) + "_"
        best_grp = None
        best_reconstruction_error = np.inf
        reconstruction_errors = []
        for i in np.arange(random_state) + 1:
            grp = GaussianRandomProjection(n,random_state=i)
            grp.fit(X)
            _x = grp.transform(X)
            p_inv = np.linalg.pinv(grp.components_)
            X_recons = np.dot(p_inv,_x.T).T
            recons_err = ComputeReconstructionSSE(X,X_recons)
            reconstruction_errors.append(recons_err)
            #print(r"n = {0} i ={1} error = {2}".format(n,i,recons_err))
            if best_grp is None or best_reconstruction_error > recons_err:
                best_grp = grp
                best_reconstruction_error = recons_err
        result[prefix+"data"] = best_grp.transform(X)
        result[prefix+"reconstruction_errors_all"] = reconstruction_errors
        result[prefix+"reconstruction_error"] = best_reconstruction_error
    return result
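PerformRandomProjections relies on a ComputeReconstructionSSE helper defined elsewhere in the project. A plausible sketch (the exact normalization in the original is not shown):

import numpy as np

def ComputeReconstructionSSE(X, X_recons):
    # per-sample sum of squared reconstruction errors, averaged over samples
    return np.mean(np.sum(np.square(np.asarray(X) - np.asarray(X_recons)), axis=1))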
def dimensionality_reduction():
    ica_best_components = 5
    pca_best_components = 6
    rp_chosen_components = 3
    variance_threshold = 0.02
    pca = PCA(n_components=pca_best_components)
    pca_x_train = pca.fit_transform(x_train)
    base_experiment.plot_eigen_values("{}-{}".format(plot_name, "PCA"),
                                      pca.explained_variance_)
    base_experiment.plot_points_3d("{}-{}".format(plot_name, "PCA"),
                                   pca_x_train)
    ica = FastICA(n_components=ica_best_components)
    ica_x_train = ica.fit_transform(x_train)
    base_experiment.plot_points_3d("{}-{}".format(plot_name, "ICA"),
                                   ica_x_train)
    rp = GaussianRandomProjection(n_components=rp_chosen_components)
    rp_x_train = rp.fit_transform(x_train)
    base_experiment.plot_points_3d(
        "{}-{}".format(plot_name, "Random Projection"), rp_x_train)
    variance_x_train = VarianceThreshold(
        threshold=variance_threshold).fit_transform(
            min_max_scaler.transform(features_data))
    variance_x_train = preprocessing.scale(variance_x_train)
    find_best_k_for_reduced_features(ica_x_train, pca_x_train, rp_x_train,
                                     variance_x_train)
    clustering_after_reduction(pca_x_train, ica_x_train, rp_x_train,
                               variance_x_train)
    run_ann_with_only_dimensionality_reduction(pca_x_train, ica_x_train,
                                               rp_x_train, variance_x_train)
Example #24
def rp(name, x, y):
    plot.style.use('seaborn-darkgrid')

    for i in range(6):
        rp = GaussianRandomProjection(eps=0.95, random_state=i)
        transformed = rp.fit_transform(x)

        axes = [0, 0]
        axes_std = [0, 0]

        for axis in range(np.shape(transformed)[1]):
            std = np.std(transformed[:, axis])
            if std > axes_std[0]:
                axes[0] = axis
                axes_std[0] = std
            elif std > axes_std[1]:
                axes[1] = axis
                axes_std[1] = std

        plot.subplot(2, 3, i + 1)
        plot.title(f'Random seed = {i}')
        plot.xlabel(f'Dimension {axes[0]}')
        plot.ylabel(f'Dimension {axes[1]}')
        plot.scatter(transformed[:, axes[0]],
                     transformed[:, axes[1]],
                     c=y,
                     cmap='viridis')

    plot.show()
Example #25
def rand_proj_reconstruction_error(train_x, n):
    '''Mean reconstruction error of Gaussian random projections,
    averaged over 10 random restarts per component count.'''

    results = []

    for i in range(1, n, 10):

        error = 0

        for j in range(1, 11):

            rand_proj = GaussianRandomProjection(n_components=i)
            reduced_df = rand_proj.fit_transform(train_x)

            pseudo_inverse = np.linalg.pinv(rand_proj.components_.T)
            reconstructed = reduced_df.dot(pseudo_inverse)

            error += metrics.mean_squared_error(train_x, reconstructed)
            # # error = (np.linalg.norm(train_x - reconstructed) ** 2) / len(train_x)
            # # error = np.sum(np.square(train_x - reconstructed))
            # error = np.mean((train_x - reconstructed)**2)
            # error =  ((train_x - reconstructed) ** 2).sum(1).mean()

        results.append({"n_components": i, "reconstruction_error": error / 10})

    return results
Example #26
class Coder(object):
    def __init__(self, n_sketches, sketch_dim):
        self.n_sketches = n_sketches
        self.sketch_dim = sketch_dim
        self.ss = StandardScaler()
        self.sp = GaussianRandomProjection(n_components=16 * n_sketches)

    def fit(self, v):
        self.ss = self.ss.fit(v)
        vv = self.ss.transform(v)
        self.sp = self.sp.fit(vv)
        vvv = self.sp.transform(vv)
        self.init_biases(vvv)

    def transform(self, v):
        v = self.ss.transform(v)
        v = self.sp.transform(v)
        v = self.discretize(v)
        v = np.packbits(v, axis=-1)
        v = np.frombuffer(np.ascontiguousarray(v), dtype=np.uint16).reshape(
            v.shape[0], -1) % self.sketch_dim
        return v

    def transform_to_absolute_codes(self, v, labels=None):
        codes = self.transform(v)
        pos_index = np.array(
            [i * self.sketch_dim for i in range(self.n_sketches)],
            dtype=np.int_)
        index = codes + pos_index
        return index
Example #27
def transform(data, alg):
    a = np.array(data)
    x = a[:, 0:-1]
    y = a[:, -1]
    if alg == 'pca':
        pca = PCA(n_components=6, whiten=True)
        x = pca.fit(x).transform(x)
        print(pca.components_)
        print(pca.explained_variance_ratio_)
    if alg == 'ica':
        kur0 = sum(kurtosis(x))
        ica = FastICA(n_components=3, whiten=False, algorithm="parallel")
        ica = ica.fit(x)
        x = ica.transform(x)
        print("kurtosis: ", sum(kurtosis(x)) - kur0)
    if alg == 'rp':
        rp = GaussianRandomProjection(n_components=1)
        rp = rp.fit(x)
        x = rp.transform(x)
        print(rp.components_)
    if alg == 'vtresh':
        kb = VarianceThreshold(threshold=.04)
        x = kb.fit_transform(x)
        print(kb.variances_)
    data = np.column_stack((x, y))
    return data
def comp1(K):
    Sum_of_squared_distances = []
    k = []
    accuracy_train = []
    accuracy_test = []
    score = []
    for i in range(1, K):
        print(i)
        grp = GaussianRandomProjection(n_components=10, eps=0.6)
        grp.fit(X)
        X_reduced = grp.transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X_reduced,
                                                            y,
                                                            test_size=0.20)
        clf = MLPClassifier(solver='lbfgs',
                            alpha=1e-5,
                            hidden_layer_sizes=[8, 8, 8, 8, 8],
                            random_state=1)
        clf.fit(X_train, y_train)
        accu_train = clf.score(X_train, y_train)
        accu_test = clf.score(X_test, y_test)
        k.append(i)
        accuracy_train.append(accu_train)
        accuracy_test.append(accu_test)
        #score.append(score_train1)
        #print(accuracy)
    k = np.array(k)
    Sum_of_squared_distances = np.array(Sum_of_squared_distances)
    score = np.array(score)
    accuracy_train = np.array(accuracy_train)
    accuracy_test = np.asarray(accuracy_test)
    #line1,=plt.plot(k, Sum_of_squared_distances, 'bx-',marker='o')
    #line2,=plt.plot(k,score,color='g',marker='o')
    line3, = plt.plot(k,
                      accuracy_train,
                      color='r',
                      marker='o',
                      label='train_accuracy')
    line4, = plt.plot(k,
                      accuracy_test,
                      color='g',
                      marker='o',
                      label='test_accuracy')
    #plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
    plt.xlabel('k')
    plt.legend()
    plt.ylabel('accuracy')
    #plt.ylim(0,1)
    plt.show()
    return None
Example #29
def rand_proj(train_x, n):
    '''Reduce train_x to n dimensions with a Gaussian random projection.'''

    rp = GaussianRandomProjection(n_components=n)
    reduced_df = rp.fit_transform(train_x)

    return reduced_df
def RP_exp(X, y, title):
    ncomp= [i+1 for i in range(X.shape[1]-1)]
    stdev=[]
    mean=[]
    for n in ncomp:
        repeats = []
        for i in range(5):
            rp = GaussianRandomProjection(n_components=n)
            temp = rp.fit_transform(X)
            repeats.append(temp)

        diffs = []
        for (i, j) in [(0, 1), (0, 2), (0, 3), (0, 4), (1, 2), (1, 3), (1, 4), (2, 3), (2, 4), (3, 4)]:
            diffs.append(repeats[i] - repeats[j])
        stdev.append(np.std(diffs))
        mean.append(np.mean(diffs))

    comp_arr=np.array(ncomp)
    mean_arr=np.array(mean)
    stdev_arr=np.array(stdev)

    plt.fill_between(comp_arr, mean_arr-stdev_arr,
                    mean_arr + stdev_arr, alpha=0.1,
                         color="b", label="Stdev")
    plt.plot(ncomp, mean, 'o-', color="b", label="Mean")
    plt.title("Mean pairwise difference of RP: "+ title)
    plt.legend(loc='best')
    plt.xlabel("n_components")
    plt.ylabel("Pairwise difference")
    plt.savefig("RP "+title)
    plt.show()
Example #31
def randproj(tx, ty, rx, ry):
    compressor = RandomProjection(tx[1].size)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wRPtr", times=10)
    km(newtx, ty, newrx, ry, add="wRPtr", times=10)
    nn(newtx, ty, newrx, ry, add="wRPtr")
Example #32
def otherScikitImpls(data):
    # new_dimension and orig_dimension are module-level globals in the
    # original project.
    rp = GaussianRandomProjection(n_components=new_dimension)
    m = rp._make_random_matrix(new_dimension, orig_dimension)
    reduced = np.asarray(m) @ np.asarray(data).T  # np.mat is deprecated

    return reduced.T
Example #33
File: analysis.py Project: jj4192/MLp3
def randproj(tx, ty, rx, ry):
    compressor = RandomProjection(tx[1].size)
    newtx = compressor.fit_transform(tx)
    compressor = RandomProjection(tx[1].size)
    newrx = compressor.fit_transform(rx)
    #em(newtx, ty, newrx, ry, add="wRPtr", times=10)
    #km(newtx, ty, newrx, ry, add="wRPtr", times=10)
    nn(newtx, ty, newrx, ry, add="wRP")    
Example #34
def run_grp(n_c, X_train, X_test, y_train, y_test):
    from sklearn.random_projection import GaussianRandomProjection
    grp = GaussianRandomProjection(n_components=n_c, eps=0.1)
    X_train = grp.fit_transform(X_train, y_train)
    X_test = grp.transform(X_test)
    print("grp components: ", grp.n_components_)

    return [X_train, X_test]
Example #35
def randproj(tx, ty, rx, ry):
    compressor = RandomProjection(tx[1].size)
    newtx = compressor.fit_transform(tx)
    compressor = RandomProjection(tx[1].size)
    newrx = compressor.fit_transform(rx)
    em(newtx, ty, newrx, ry, add="wRPtr", times=10)
    km(newtx, ty, newrx, ry, add="wRPtr", times=10)
    nn(newtx, ty, newrx, ry, add="wRPtr")
Example #36
def randproj(tx, ty, rx, ry):
    print "randproj"
    compressor = RandomProjection(tx[1].size)
    compressor.fit(tx, y=ty)
    newtx = compressor.transform(tx)
    # compressor = RandomProjection(tx[1].size)
    newrx = compressor.transform(rx)
    em(newtx, ty, newrx, ry, add="wRPtr")
    km(newtx, ty, newrx, ry, add="wRPtr")
    nn(newtx, ty, newrx, ry, add="wRPtr")
    print "randproj done"
Example #37
    def __init__(self, dataset, dataset_name, num_components=10):
        self.dataset = dataset
        self.dataset_name = dataset_name
        self.labels = dataset.target
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(dataset.data)
        self.n_samples, self.n_features = self.data.shape

        self.reducer = GaussianRandomProjection(n_components=num_components)
Example #38
def find_best_rp(train_data, test_data, start_components, max_features):
    # find random proj with lowest reconstruction error
    best_c = 0
    #scores = []
    #best_k = 0
    #best = 0
    best_rs = 0
    best_r = 20000

    # go to max - 1 since it doesn't make sense to randomly project to the same dimension
    r = range(start_components, max_features)
    print(r)
    # center data for reconstruction
    scaler = StandardScaler(with_mean=True, with_std=False)
    centered = scaler.fit_transform(test_data)
    for c in r:
        print('C=%d' % c)
        for rs in range(1, 501):
            rp = GaussianRandomProjection(n_components=c, random_state=rs).fit(train_data)
            fit = rp.transform(centered)
            recon = extmath.safe_sparse_dot(fit, rp.components_) + scaler.mean_
            err = linalg.norm(test_data - recon)
            if err < best_r:
                best_r = err
                best_c = c
                best_rs = rs
    print('best reconstruction error=%.4f' % best_r)
    print('>>best rs=%d,c=%d' % (best_rs, best_c))

    # for the best component count, track the variation across random seeds
    v_max = 0
    errsum = 0
    for rs in range(1, 501):
        rp = GaussianRandomProjection(n_components=best_c, random_state=rs).fit(train_data)
        fit = rp.transform(centered)
        recon = extmath.safe_sparse_dot(fit, rp.components_) + scaler.mean_
        err = linalg.norm(test_data - recon)
        errsum += err
        if err > v_max:
            v_max = err

    print('RP max:%.3f, avg:%.3f' % (v_max, errsum / 500))

    return best_c, best_rs
 def fit(self, X, y, sample_weight=None):
     self.classes_ = numpy.array([0, 1])
     self.proj = GaussianRandomProjection(n_components=self.n_components)
     # self.knner = KNeighborsClassifier(n_neighbors=self.knn)
     self.knner = Knn1dClassifier(self.knn)
     self.proj.fit(X)
     X_new = self.proj.transform(X)
     # TODO sample weight!!
     self.knner.fit(X_new, y, sample_weight=sample_weight)
     print('ok')
     return self
Example #40
def test_fixed_state_transformer():

    random_state = check_random_state(0)
    X = random_state.rand(500, 100)

    # Check that setting the random_seed is equivalent to setting the
    # random_state
    transf = GaussianRandomProjection(n_components=5, random_state=0)
    fixed_transf = FixedStateTransformer(GaussianRandomProjection(n_components=5), random_seed=0)
    assert_array_almost_equal(fixed_transf.fit_transform(X), transf.fit_transform(X))

    # Check that set_params doesn't modify the results
    fixed_transf = FixedStateTransformer(GaussianRandomProjection(n_components=5, random_state=None))

    fixed_transf2 = FixedStateTransformer(GaussianRandomProjection(random_state=1, n_components=5))

    assert_array_almost_equal(fixed_transf.fit_transform(X), fixed_transf2.fit_transform(X))

    # Check that it works when there is no random_state
    fixed_transf = FixedStateTransformer(IdentityProjection())
    assert_array_almost_equal(fixed_transf.fit_transform(X), X)
class ProjClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, n_components=1, knn=100):
        self.n_components = n_components
        self.knn = knn

    def fit(self, X, y, sample_weight=None):
        self.classes_ = numpy.array([0, 1])
        self.proj = GaussianRandomProjection(n_components=self.n_components)
        # self.knner = KNeighborsClassifier(n_neighbors=self.knn)
        self.knner = Knn1dClassifier(self.knn)
        self.proj.fit(X)
        X_new = self.proj.transform(X)
        # TODO sample weight!!
        self.knner.fit(X_new, y, sample_weight=sample_weight)
        print('ok')
        return self

    def predict_proba(self, X):
        X_new = self.proj.transform(X)
        return self.knner.predict_proba(X_new)

    def predict(self, X):
        return numpy.argmax(self.predict_proba(X), axis=1)
Example #42
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train.drop(["y"], axis=1))
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train.drop(["y"], axis=1))
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train.drop(["y"], axis=1))
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train.drop(["y"], axis=1))
srp_results_test = srp.transform(test)

# Append decomposition components to datasets
for i in range(1, n_comp+1):
    train['pca_' + str(i)] = pca2_results_train[:,i-1]
    test['pca_' + str(i)] = pca2_results_test[:, i-1]

    train['ica_' + str(i)] = ica2_results_train[:,i-1]
    test['ica_' + str(i)] = ica2_results_test[:, i-1]
Example #43
File: feature.py Project: zgcgreat/WSDM
def gen_feature(train, test):
    train = pd.DataFrame(train)
    test = pd.DataFrame(test)

    n_comp = 15
    drop_list = []
    test_drop_list = []

    print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape)
    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components

    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, test
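gen_feature prints elapsed times against a module-level start timestamp and assumes the usual imports. The missing setup is roughly:

from datetime import datetime

import pandas as pd
from sklearn.decomposition import PCA, FastICA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

start = datetime.now()  # gen_feature reports time deltas against this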
Example #44

ap_region_data = { k:v for (k,v) in region_data.items() if ap_champs[k[2]]}
ap_tier_data = { k:v for (k,v) in tier_data.items() if ap_champs[k[2]]}
ap_cross_data = { k:v for (k,v) in cross_data.items() if ap_champs[k[3]]}
ap_patch_data = { k:v for (k,v) in patch_data.items() if ap_champs[k[1]]}

all_ap_data = list(ap_region_data.values()) + list(ap_tier_data.values()) + list(ap_cross_data.values()) + list(ap_patch_data.values())

#print(ap_champs)

#pca = PCA(n_components=2)
#reduction = pca.fit_transform(ap_champs.values())
#print(pca.explained_variance_ratio_)

grp = GaussianRandomProjection(2, random_state = 0)
grp.fit(all_ap_data)
region_reduction = grp.transform(list(ap_region_data.values()))
tier_reduction = grp.transform(list(ap_tier_data.values()))
cross_reduction = grp.transform(list(ap_cross_data.values()))
patch_reduction = grp.transform(list(ap_patch_data.values()))

region_json_data = []
region_keys = list(ap_region_data.keys())
for i in range(len(region_keys)):
	key = region_keys[i]
	data = list(region_reduction[i])
	num_games = region_games[key]
	region_json_data.append( {
		"patch":key[0],
		"region":key[1],
		#"tier":key[2],
def random(X, K):
    grp = GaussianRandomProjection(n_components=K)
    X_red = grp.fit_transform(X)
    X_red = normalizer.fit_transform(X_red)
    return X_red
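normalizer is not defined in this snippet; a unit-norm scaler from scikit-learn matches how it is used above:

from sklearn.preprocessing import Normalizer

normalizer = Normalizer()  # rescales each projected row to unit L2 norm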
import numpy as np

# Load the 20 newsgroups dataset.
# Select only the sci.crypt category;
# other categories include
# sci.med, sci.space, and soc.religion.christian.
cat = ['sci.crypt']
data = fetch_20newsgroups(categories=cat)

# Create a term-document matrix with term frequencies as the values from
# the above dataset
vectorizer = TfidfVectorizer(use_idf=False)
vector = vectorizer.fit_transform(data.data)

# Perform the projection. In this case we reduce the dimension to 1000
gauss_proj = GaussianRandomProjection(n_components=1000)
gauss_proj.fit(vector)
# Transform the original data to the new space
vector_t = gauss_proj.transform(vector)

# Print transformed vector shape
print(vector.shape)
print(vector_t.shape)

# To validate if the transformation has preserved the distance, we calculate the
# old and the new distance between the points
org_dist = euclidean_distances(vector)
red_dist = euclidean_distances(vector_t)
diff_dist = abs(org_dist - red_dist)

# We take the difference between these points and plot them as a heatmap (only
# Let's first generate a set of samples
random_state = np.random.RandomState(0)  # any seeded RandomState works here
n_samples = 2000
n_outputs = 500
X = 3 + 5 * random_state.normal(size=(n_samples, n_outputs))

# Let's compute the sum of the variance in the original output space
var_origin = np.var(X, axis=0).sum()

# Let's compute the variance on a random subspace
all_n_components = np.array([1, 50, 100, 200, 400, 500])
n_repetitions = 10
distortion = np.empty((len(all_n_components), n_repetitions))

for i, n_components in enumerate(all_n_components):
    for j in range(n_repetitions):
        transformer = GaussianRandomProjection(n_components=n_components,
                                               random_state=random_state)
        X_subspace = transformer.fit_transform(X)
        distortion[i, j] = np.var(X_subspace, axis=0).sum() / var_origin

# Let's plot the distortion as a function of the compression ratio
distortion_mean = distortion.mean(axis=1)
distortion_std = distortion.std(axis=1)

plt.figure()
plt.plot(all_n_components / n_outputs, distortion_mean, "o-", color="g")
plt.plot(all_n_components / n_outputs, np.ones_like(distortion_mean),
         "--", color="r")
plt.fill_between(all_n_components / n_outputs,
                 distortion_mean - distortion_std,
                 distortion_mean + distortion_std, alpha=0.25, color="g")
plt.xlabel("n_components / n_outputs")
def select_features_GaussianRandomProjections(train_X, train_y, test_X, k):
    selector = GaussianRandomProjection(n_components=k, random_state=42)
    selector.fit(train_X)
    train_X = selector.transform(train_X)
    test_X = selector.transform(test_X)
    return train_X, test_X
Example #49
def Random_Projection(M, new_dim, prng):
    # seed the projection with the supplied PRNG so results are reproducible
    proj = GaussianRandomProjection(n_components=new_dim, eps=0.1, random_state=prng)
    return proj.fit_transform(M)
Example #50
File: feature.py Project: zgcgreat/WSDM
def gen_features(train, val, test):
    train = pd.DataFrame(train)
    val = pd.DataFrame(val)
    test = pd.DataFrame(test)
    # cat_cols = ['city', 'bd', 'gender', 'registered_via', 'registration_init_year',
    #              'registration_init_month', 'registration_init_date', 'payment_method_id', 'payment_plan_days',
    #              'plan_list_price', 'actual_amount_paid', 'is_auto_renew', 'is_cancel',
    #              'transaction_date_year', 'transaction_date_month', 'transaction_date_date',
    #              'membership_expire_date_year',
    #              'membership_expire_date_month', 'membership_expire_date_date', 'membership_transaction_gap',
    #              'cancel_times',
    #              'auto_renew_count', 'plan_net_worth', 'user_date_year', 'user_date_month',
    #              'user_date_date']
    # con_cols = [x for x in train.columns if x not in cat_cols and x not in ['msno', 'is_churn']]
    # train[cat_cols] = train[cat_cols].astype('object')
    # test[cat_cols] = test[cat_cols].astype('object')
    # val[cat_cols] = val[cat_cols].astype('object')
    #
    # for col in cat_cols:
    #     train[col].fillna(value=train[col].mode()[0], inplace=True)
    #     test[col].fillna(value=test[col].mode()[0], inplace=True)
    #     val[col].fillna(value=val[col].mode()[0], inplace=True)
    # for col in con_cols:
    #     train[col].fillna(value=train[col].mean(), inplace=True)
    #     test[col].fillna(value=test[col].mean(), inplace=True)
    #     val[col].fillna(value=val[col].mean(), inplace=True)
    #
    # for c in train.columns:
    #     if train[c].dtype == 'object':
    #         lbl = LabelEncoder()
    #         lbl.fit(list(train[c].values) + list(test[c].values))
    #         train[c] = lbl.transform(list(train[c].values))
    #         test[c] = lbl.transform(list(test[c].values))

    n_comp = 15

    drop_list = []
    test_drop_list = []

    print(train.drop(drop_list, axis=1).shape, test.drop(test_drop_list, axis=1).shape)
    print('tSVD', datetime.now() - start)
    # tSVD
    tsvd = TruncatedSVD(n_components=n_comp)
    tsvd_results_train = tsvd.fit_transform(train.drop(drop_list, axis=1))
    tsvd_results_val = tsvd.transform(val.drop(test_drop_list, axis=1))
    tsvd_results_test = tsvd.transform(test.drop(test_drop_list, axis=1))

    print('PCA', datetime.now() - start)
    # PCA
    pca = PCA(n_components=n_comp)
    pca2_results_train = pca.fit_transform(train.drop(drop_list, axis=1))
    pca2_results_val = pca.transform(val.drop(test_drop_list, axis=1))
    pca2_results_test = pca.transform(test.drop(test_drop_list, axis=1))

    print('ICA', datetime.now() - start)
    # ICA
    ica = FastICA(n_components=n_comp, max_iter=10000)
    ica2_results_train = ica.fit_transform(train.drop(drop_list, axis=1))
    ica2_results_val = ica.transform(val.drop(test_drop_list, axis=1))
    ica2_results_test = ica.transform(test.drop(test_drop_list, axis=1))

    print('GRP', datetime.now() - start)
    # GRP
    grp = GaussianRandomProjection(n_components=n_comp, eps=0.1)
    grp_results_train = grp.fit_transform(train.drop(drop_list, axis=1))
    grp_results_val = grp.transform(val.drop(test_drop_list, axis=1))
    grp_results_test = grp.transform(test.drop(test_drop_list, axis=1))

    print('SRP', datetime.now() - start)
    # SRP
    srp = SparseRandomProjection(n_components=n_comp, dense_output=True)
    srp_results_train = srp.fit_transform(train.drop(drop_list, axis=1))
    srp_results_val = srp.transform(val.drop(test_drop_list, axis=1))
    srp_results_test = srp.transform(test.drop(test_drop_list, axis=1))

    # MCA
    # res_mca = MCA(train, ncp=n_comp, graph = FALSE)

    # save columns list before adding the decomposition components

    usable_columns = list(set(train.columns) - set(drop_list))

    # Append decomposition components to datasets
    for i in range(1, n_comp + 1):
        train['pca_' + str(i)] = pca2_results_train[:, i - 1]
        val['pca_' + str(i)] = pca2_results_val[:, i - 1]
        test['pca_' + str(i)] = pca2_results_test[:, i - 1]

        train['ica_' + str(i)] = ica2_results_train[:, i - 1]
        val['ica_' + str(i)] = ica2_results_val[:, i - 1]
        test['ica_' + str(i)] = ica2_results_test[:, i - 1]

        train['tsvd_' + str(i)] = tsvd_results_train[:, i - 1]
        val['tsvd_' + str(i)] = tsvd_results_val[:, i - 1]
        test['tsvd_' + str(i)] = tsvd_results_test[:, i - 1]

        train['grp_' + str(i)] = grp_results_train[:, i - 1]
        val['grp_' + str(i)] = grp_results_val[:, i - 1]
        test['grp_' + str(i)] = grp_results_test[:, i - 1]

        train['srp_' + str(i)] = srp_results_train[:, i - 1]
        val['srp_' + str(i)] = srp_results_val[:, i - 1]
        test['srp_' + str(i)] = srp_results_test[:, i - 1]
    return train, val, test
Example #51
# Perform Truncated Singular Value Decomposition (SVD)
from sklearn.decomposition import TruncatedSVD as TruncSVD
tsvd = TruncSVD(n_components=num_components,  algorithm='randomized', random_state=0)
tsvd_transformed_data_train = tsvd.fit_transform(sparse_trainData)
tsvd_transformed_data_valid = tsvd.transform(sparse_validData)

# Perform Randomized Principal Components Analysis (PCA)
# (RandomizedPCA was removed from newer scikit-learn; there, use
# PCA(n_components=num_components, svd_solver='randomized') instead)
from sklearn.decomposition import RandomizedPCA as RPCA
rpca = RPCA(n_components=num_components)
rpca_transformed_data_train = rpca.fit_transform(dense_trainData)
rpca_transformed_data_valid = rpca.transform(dense_validData)

# Perform Gaussian Random Projection
from sklearn.random_projection import GaussianRandomProjection as GaussRan
grp = GaussRan(n_components=num_components)
grp_transformed_data_train = grp.fit_transform(dense_trainData)
grp_transformed_data_valid = grp.transform(dense_validData)

# Perform Sparse Random Projection
from sklearn.random_projection import SparseRandomProjection as SparseRan
srp = SparseRan(n_components=num_components, random_state=0)
srp_transformed_data_train = srp.fit_transform(dense_trainData)
srp_transformed_data_valid = srp.transform(dense_validData)

# Perform classification using 1-Nearest Neighbor Classifier
from sklearn.neighbors import KNeighborsClassifier

# Create a subset grid to plot performance against numbers of components
tsvd_max = tsvd_transformed_data_train.shape[1]
plot_subset = []
Example #52
int10 = np.array(list(map(map_bin_dec, bin_feats)))
int10 = int10 / max(int10)

df_non_obj_feats['binSum'] = df_non_obj_feats.apply(sum, 1)
df_non_obj_feats['binDec'] = int10

all_data_proc = pd.concat((df_obj_feats_freq, df_non_obj_feats), axis=1)

#%%
from sklearn.decomposition import PCA, FastICA
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
n_comp = 12

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results = grp.fit_transform(all_data_proc)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results = srp.fit_transform(all_data_proc)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca_results = pca.fit_transform(all_data_proc)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica_results = ica.fit_transform(all_data_proc)
for i in range(1, n_comp+1):
    all_data_proc['pca_' + str(i)] = pca_results[:,i-1]
Example #53
def gaussianRP(data):
    # new_dimension is a module-level global in the original project
    rp = GaussianRandomProjection(n_components=new_dimension)
    return rp.fit_transform(data)
from sklearn.mixture import GMM  # GaussianMixture in newer scikit-learn
from load_mydata import LoadData
import math

mushroom = LoadData("mushroom")
data = scale(mushroom.data)
labels = np.array(mushroom.labels)

n_samples, n_features = data.shape
n_digits = len(np.unique(labels))
n_iter = 1000

print("n_digits: %d, \t n_samples %d, \t n_features %d"
      % (n_digits, n_samples, n_features))
t0 = time()
rp = GaussianRandomProjection(n_components=20)
reduced_data = rp.fit_transform(data)
print("time spent: %0.3fs" % (time()-t0))
#reduced_data = data

# Plot the data
fig=plt.figure()
#plt.clf()
n_plots=9
h = 0.02
x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
for index in range(1,n_plots+1):
   vert = math.floor(math.sqrt(n_plots))
   hori = n_plots // vert  # integer division; '/' would pass a float to add_subplot
   fig.add_subplot(vert, hori, index)
Example #55
class RCAReducer():

    def __init__(self, dataset, dataset_name, num_components=10):
        self.dataset = dataset
        self.dataset_name = dataset_name
        self.labels = dataset.target
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(dataset.data)
        self.n_samples, self.n_features = self.data.shape

        self.reducer = GaussianRandomProjection(n_components=num_components)

    def reduce(self):
        self.reducer.fit(self.data)
        self.reduced = self.scaler.fit_transform(self.reducer.transform(self.data))
        return self.reduced

    def benchmark(self, estimator, name, data):
        t0 = time()
        sample_size = 300
        labels = self.labels

        estimator.fit(data)
        print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
              % (name, (time() - t0), estimator.inertia_,
                 metrics.homogeneity_score(labels, estimator.labels_),
                 metrics.completeness_score(labels, estimator.labels_),
                 metrics.v_measure_score(labels, estimator.labels_),
                 metrics.adjusted_rand_score(labels, estimator.labels_),
                 metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
                 metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)))

    def display_reduced_digits(self):
        sys.stdout = open('out/RCAReduceDigitsOutput.txt', 'w')
        print("RCA Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print("Length of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print("\nProjection axes:\n")
        for i,axis in enumerate(self.reducer.components_.tolist()):
            print("Axis %d:\n" % i, axis)
        self.compute_plane_variance()

    def compute_plane_variance(self):
        points_along_dimension = self.reduced.T
        for i,points in enumerate(points_along_dimension):
            print("\nVariance of dimension %d:" % i)
            print(np.var(points), "\n")

    def display_reduced_iris(self):
        sys.stdout = open('out/RCAReduceIrisOutput.txt', 'w')
        print("RCA Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print("Length of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print("\nProjection axes:\n")
        for i,axis in enumerate(self.reducer.components_.tolist()):
            print("Axis %d:\n" % i, axis)
        self.compute_plane_variance()

    def reduce_crossvalidation_set(self, X_train, X_test):
        self.reducer.fit(X_train)
        reduced_X_train = self.reducer.transform(X_train)
        reduced_X_test = self.reducer.transform(X_test)
        return reduced_X_train, reduced_X_test
Example #56
					champion_items[key['champ']] [(key["patch"], region, tier)] ["first"] [key['first']] += build['value']
					champion_items[key['champ']] [(key["patch"], region, tier)] ["second"] [key['second']] += build['value']
					champion_items[key['champ']] [(key["patch"], region, tier)] ["third"] [key['third']] += build['value']

					#update champion games played
					champion_games[key['champ']] [(key['patch'], region, tier)] += build['value']

items_json = []
for champ in champion_builds.keys():

	# perform GaussianRandomProjection
	all_builds = []
	for key in champion_builds[champ]:
		all_builds += champion_builds[champ][key]

	grp = GaussianRandomProjection(2, random_state = 0)
	grp.fit(all_builds)

	for key in champion_builds[champ]:
		builds = champion_builds[champ][key]
		reduction = grp.transform(builds)

		# get top 100 builds
		zipped = zip(list(reduction), build_games[champ][key], build_objects[champ][key])
		sorted_zipped = sorted(zipped, key=lambda x: x[1], reverse=True)
		top_builds = sorted_zipped[0:100]

		builds_json = []
		for i in top_builds:
			x = list(i[0])[0]
			y = list(i[0])[1]