Example no. 1
    def run_pca_clustering(self):
        from sklearn.decomposition import PCA as pca

        comp_no_required = int(self.pca_cluster_comp_no.text())
        bkfreecars_2d = anscombe.tran4d_2d(self.bkfreecars)

        # whiten according to the checkbox state (isChecked() returns a bool)
        if self.pca_cluster_whiten.isChecked():
            pca_model = pca(n_components=comp_no_required, whiten=True)
        else:
            pca_model = pca(n_components=comp_no_required, whiten=False)

        self.pca_clusterW = pca_model.fit_transform(bkfreecars_2d.s2 -
                                                    bkfreecars_2d.s1)
        self.pca_clusterH = pca_model.components_

        yi, xi, zi, si = self.bkfreecars.shape
        self.pca_clusterW = np.reshape(self.pca_clusterW,
                                       (yi, xi, comp_no_required))

        self.pca_cluster_choose.setValue(1)
        self.pca_cluster_choose.setMinimum(1)
        self.pca_cluster_choose.setMaximum(comp_no_required)

        self.pca_cluster_image_win.setImage(self.pca_clusterW[:, :, 0])
        self.pca_cluster_spectrum_win.plot(self.pca_clusterH[:, 0])
Example no. 2
    def test_pca(self):
        django.setup()
        from koe.models import Feature, Aggregation, FullTensorData, Database
        from koe.ts_utils import bytes_to_ndarray, get_rawdata_from_binary

        database = Database.objects.get(name='Bellbird_TMI')
        features = Feature.objects.all().order_by('id')
        aggregations = Aggregation.objects.all().order_by('id')
        features_hash = '-'.join(
            list(map(str, features.values_list('id', flat=True))))
        aggregations_hash = '-'.join(
            list(map(str, aggregations.values_list('id', flat=True))))

        full_tensor = FullTensorData.objects.filter(
            database=database,
            features_hash=features_hash,
            aggregations_hash=aggregations_hash).first()
        if full_tensor is None:
            raise Exception('Tensor not found')

        full_sids_path = full_tensor.get_sids_path()
        full_bytes_path = full_tensor.get_bytes_path()

        sids = bytes_to_ndarray(full_sids_path, np.int32)
        full_data = get_rawdata_from_binary(full_bytes_path, len(sids))

        with tictoc('PCA'):
            dim_reduce_func = pca(n_components=50)
            dim_reduce_func.fit_transform(full_data)
Example no. 3
    def select(self, n, n_components=20):

        # drop stocks that are not listed, and save stock index in a dict
        self.df = self.df.dropna(axis=1, how='any')
        ticker_name = self.df.columns
        ticker_dict = {x: y for x, y in enumerate(ticker_name)}

        # construct the pca model and fit the model with data
        sample = self.df.values
        model = pca(n_components=n_components)
        model.fit(sample)

        # compute PCA components and corresponding variance ratio
        pcs = model.components_
        pcs_mat = np.matrix(pcs)
        var_ratio = model.explained_variance_ratio_
        var_ratio_mat = np.matrix(var_ratio)

        # compute overall loadings for each stock
        load_mat = var_ratio_mat * pcs_mat

        # find the top n stocks with the largest absolute loadings
        load_arr = np.asarray(load_mat).reshape(-1)
        load_dict = {y: x for x, y in enumerate(load_arr)}
        sort_load = sorted(load_arr, key=abs, reverse=True)
        top_load = sort_load[:n]
        ticker_num = [load_dict[x] for x in top_load]
        selected_ticker = [ticker_dict[x] for x in ticker_num]

        return selected_ticker
Example no. 4
def cars_pca(cars, num_comp, gauss_std=0):
    cars_ansc = cars_anscombe(cars, gauss_std)
    cars_ansc2d = tran4d_2d(cars_ansc)

    pca_tran2 = pca(n_components=int(num_comp))
    cars_pca1 = pca_tran2.fit_transform(cars_ansc2d.s1)
    cars_pca1 = pca_tran2.inverse_transform(cars_pca1)
    cars_pca2 = pca_tran2.fit_transform(cars_ansc2d.s2)
    cars_pca2 = pca_tran2.inverse_transform(cars_pca2)

    cars_pca_v = EmptyClass()
    cars_pca_v.s1 = np.reshape(
        cars_pca1,
        (cars_ansc2d.a, cars_ansc2d.b, cars_ansc2d.c, cars_ansc2d.d))
    cars_pca_v.s2 = np.reshape(
        cars_pca2,
        (cars_ansc2d.a, cars_ansc2d.b, cars_ansc2d.c, cars_ansc2d.d))

    cars_pcainv = cars_invanscombe(cars_pca_v, gauss_std)

    pcacars = EmptyClass()
    pcacars.s1 = np.reshape(
        cars_pcainv.s1,
        (cars_ansc2d.a, cars_ansc2d.b, cars_ansc2d.c, cars_ansc2d.d))
    pcacars.s2 = np.reshape(
        cars_pcainv.s2,
        (cars_ansc2d.a, cars_ansc2d.b, cars_ansc2d.c, cars_ansc2d.d))

    return pcacars
Example no. 5
 def plot_data(self, samples, fig_name):
     # plot training data
     # pca
     # fit the normalized dataset to a pca object, and reduce dimensions from 16 to 2
     fig = plt.figure()
     normalized_samples = data_prep.normalize_data(samples, self.col_names)
     y_pred_samples = self.get_predictions(
         data_prep.get_scaled_data(samples, self.col_names,
                                   self.scale_method))
     pca_obj = pca(n_components=2)
     pca_transformed = pandas.DataFrame(
         pca_obj.fit_transform(normalized_samples))
     plt.scatter(pca_transformed[y_pred_samples == -1][0],
                 pca_transformed[y_pred_samples == -1][1],
                 label='Outlier',
                 c='red')
     plt.scatter(pca_transformed[y_pred_samples == 1][0],
                 pca_transformed[y_pred_samples == 1][1],
                 label='Inlier',
                 c='blue')
     ts = dt.now().strftime('%Y%m%d-%H%M%S')  # Use timestamp as file id
     plt.legend(loc=2)
     plt.title(
         'PCA Plot of %d Samples\n kernel = RBF nu = %s gamma = %s\nDimension reduction from %d to 2'
         % (len(y_pred_samples), str(self.nu_parameter),
            str(self.gamma_parameter), len(self.col_names)))
     plt.savefig('../data/diagrams/%s_%s.png' % (fig_name, ts),
                 bbox_inches='tight')
Example no. 6
def PCA(*args, n_comp=None):
    '''
    Takes a single tuple of arrays and applies PCA to all of them;
    returns the concatenated array with PCA applied.
    '''
    from sklearn.decomposition import PCA as pca
    # args holds one tuple/list of arrays, so unpacking it passes that
    # sequence straight to np.concatenate
    d = np.concatenate(*args, axis=0)
    pc = pca(n_components=n_comp)
    return pc.fit_transform(d)
Example no. 7
def mypca(filename1, number):
    number = int(number)
    x, y = data_format(filename1)
    pcafif = pca(n_components=number, whiten=False, random_state=7)
    pcafif.fit(x)
    result = pcafif.explained_variance_ratio_
    ratios = result.tolist()
    json_str = json.dumps(ratios)
    return json_str
Example no. 8
def plot_cluster_pca(
    Xmat,
    Xcluster_label=None,
    metric="euclidean",
    dimpca=2,
    whiten=True,
    isprecompute=False,
    savefile="",
    doreturn=1,
):
    """
    :return:
    """

    from sklearn.decomposition import PCA as pca

    if isprecompute:
        Xmat_dist = Xmat
    else:
        Xmat_dist = sci.spatial.distance.squareform(
            sci.spatial.distance.pdist(Xmat,
                                       metric=metric,
                                       p=dimpca,
                                       w=None,
                                       V=None,
                                       VI=None))

    model = pca(n_components=dimpca, whiten=whiten)
    X_pca = model.fit_transform(Xmat)

    # plot the result
    xx, yy = X_pca[:, 0], X_pca[:, 1]
    if Xcluster_label is None:
        Yclass = np.zeros(X_pca.shape[0])
    else:
        Yclass = Xcluster_label

    plot_XY(xx,
            yy,
            zcolor=Yclass,
            labels=Yclass,
            color_dot="plasma",
            savefile=savefile)

    if doreturn:
        return X_pca
Example no. 9
def find_pca_comp(cars, gauss_std=0):
    cars_ansc = cars_anscombe(cars, gauss_std)
    cars_ansc2d = tran4d_2d(cars_ansc)

    cars_diff = cars_ansc2d.s2 - cars_ansc2d.s1

    # PCA carried out on the difference data first to choose the number of components to keep
    pca_tran = pca()
    pca_tran.fit_transform(cars_diff)
    diff_pca_var = pca_tran.explained_variance_ratio_

    plt.figure()
    plt.title('Variance contribution of principal components')
    plt.xlabel('Principal component no.')
    plt.ylabel('Variance contribution (between 0 and 1)')
    plt.plot(diff_pca_var)
    plt.draw()
    plt.show(block=False)
Example no. 10
def analysis(df):

    scaled_df = preprocessing.scale(df)

    pc = pca()

    pc.fit(scaled_df)

    pca_data = pc.transform(scaled_df)

    per_var = np.round(pc.explained_variance_ratio_ * 100, decimals=1)

    labels = ['PC' + str(x) for x in range(1, len(per_var) + 1)]

    plt.bar(x=range(1, len(per_var) + 1), height=per_var, tick_label=labels)
    plt.ylabel('Percentage of Explained Variance')
    plt.xlabel('Principal Component')
    plt.title('Scree Plot')
    plt.show()

    pca_df = pd.DataFrame(pca_data, columns=labels)

    plt.scatter(pca_df.PC1, pca_df.PC2)
    plt.title('My PCA Graph')
    plt.xlabel('PC1 - {0}%'.format(per_var[0]))
    plt.ylabel('PC2 - {0}%'.format(per_var[1]))

    for sample in pca_df.index:
        plt.annotate(sample, (pca_df.PC1.loc[sample], pca_df.PC2.loc[sample]))

    plt.show()

    print(pc.components_[0])
Example no. 11
 def _get_values(self, pop):
     nsp = pop.numSubPop()
     all_alleles = []
     for subpop in range(nsp):
         for ind in pop.individuals(subPop=subpop):
             geno = ind.genotype()
             n_markers = len(geno) // 2
             for mi in range(n_markers):
                 if len(all_alleles) <= mi:
                     all_alleles.append(set())
                 a1 = geno[mi]
                 a2 = geno[mi + n_markers]
                 all_alleles[mi].add(a1)
                 all_alleles[mi].add(a2)
     for i, alleles in enumerate(all_alleles):
         all_alleles[i] = sorted(list(alleles))
     inds = defaultdict(list)
     for mi in range(n_markers):
         for subpop in range(nsp):
             for i, ind in enumerate(pop.individuals(subPop=subpop)):
                 geno = ind.genotype()
                 a1 = geno[mi]
                 a2 = geno[mi + n_markers]
                 for a in all_alleles[mi]:
                     inds[(subpop, i)].append([a1, a2].count(a))
     ind_order = sorted(list(inds.keys()))
     arr = []
     for ind in ind_order:
         arr.append(inds[ind])
     my_pca = pca(n_components=2)
     X = np.array(arr)
     my_pca.fit(X)
     X_r = my_pca.transform(X)
     my_components = {}
     for i, ind in enumerate(ind_order):
         my_components[ind] = X_r[i]
     return my_components
    Method = input(
        "Choose Method: Enter PCA for Principal Component Analysis, EM for Probabilistic PCA and KERNEL for Kernel Method: "
    )
    Dimension = int(input("Give Dimensionality of Projection:"))
    M = Dimension

    if Method == 'PCA':
        PCA(X, y, M)
    elif Method == 'EM':
        PPCA(X, y, M)
    else:
        KERNEL(X, y, M)

    # --- PCA with scikit-learn's built-in class

    from sklearn.decomposition import PCA as pca

    n_components = Dimension
    my_pca = pca(n_components)

    Projected_Data = my_pca.fit_transform(X.T).T

    if Dimension == 2:
        Practical_Plots(Projected_Data)
    else:
        D_Plots(Projected_Data)

print("\n\n*****____End of Process____*****\n\n")

Time = time.process_time()
print(Time)
}]

hospital_data = merged_data.append(newrows, ignore_index=True, sort=False)
hospital_data.shape

#Using numerical columns, conduct PCA and obtain the eigenvalues
##Define numerical columns:
hospital_data.dtypes
hospital_data.describe(include=['number'])
##PCA
hospital_reduct_pca = hospital_data[[
    'HospitalID', 'FullTimeCount', 'NetPatientRevenue', 'InpatientOperExp',
    'OutpatientOperExp', 'Operating_Revenue', 'Operating_Income', 'AvlBeds',
    'Compensation', 'MaxTerm'
]]
pca_result = pca(n_components=10).fit(hospital_reduct_pca)

#Obtain eigenvalues
pca_result.explained_variance_

#Components from the PCA
pca_result.components_

#Scree plot
plt.figure(figsize=(7, 5))
plt.plot([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], pca_result.explained_variance_ratio_,
         '-o')
plt.ylabel('Proportion of Variance Explained')
plt.xlabel('Principal Component')
plt.xticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
Example no. 14
 def __init__(self, m, **kwargs):
     self.model = pca(n_components=m, **kwargs)
Example no. 15
    #data.get_dataframe()
    #corr_dat = pd.concat([corr_dat, data.data_frame])
    print('file %i' % file)

    n_components = 2

    for taste in range(4):
        this_off = data.normal_off_firing[taste]
        this_off = this_off[:, :, 80:160]
        total_off = this_off[0, :, :]
        for nrn in range(1, this_off.shape[0]):
            total_off = np.concatenate((total_off, this_off[int(nrn), :, :]),
                                       axis=1)

        reduced_off_pca = pca(n_components=15).fit(total_off)
        print(sum(reduced_off_pca.explained_variance_ratio_))
        reduced_off = reduced_off_pca.transform(total_off)
        gmm = GaussianMixture(n_components=n_components,
                              covariance_type='full',
                              n_init=200).fit(reduced_off)
        print(gmm.predict(reduced_off))

        this_groups = gmm.predict(reduced_off)
        trial_order = np.argsort(this_groups)

        # Pull out and cluster distance matrices
        this_dist = off_stim_dists[taste]
        clust_dist = this_dist[trial_order, :]
        clust_dist = clust_dist[:, trial_order]
Example no. 16
"""
Genetic Algorithm
"""

from __future__ import division
import numpy as np
from sklearn.metrics import silhouette_samples as score
from feng.Genetic.mutation import mutation
from feng.Genetic.selection import selection
from feng.Genetic.crossover import crossover
from feng.Genetic import utils
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD as pca
import pickle
trn_term_doc = pickle.load(open('../data2/tfidf.txt', 'rb'))
p = pca(n_components=5000, random_state=42)

y = pickle.load(open('../data2/label.txt', 'rb'))
trn_term_doc = p.fit_transform(trn_term_doc, y)
pickle.dump(trn_term_doc, open('../data2/pca.txt', 'wb'))
size = trn_term_doc.shape[1]

def aimFunction(bits):
    # keep only the columns whose chromosome bit is "1" (getcol assumes a
    # scipy sparse matrix), then score the selected columns
    r = []
    for index, bit in enumerate(bits):
        if bit == "1":
            r.append(trn_term_doc.getcol(index))
    temp = hstack(r)
    s = score(temp, y)
    print(s)
    return s
Example no. 17
rows, columns = reduce_data.shape
reduce_data.columns
reduce_data.dtypes
reduce_data.head()
reduce_data['Teaching'] = reduce_data['Teaching'].astype('category')
reduce_data['TypeControl'] = reduce_data['TypeControl'].astype('category')
reduce_data['DonorType'] = reduce_data['DonorType'].astype('category')

reduce_data_pca = reduce_data[[
    'NoFTE', 'NetPatRev', 'InOperExp', 'OutOperExp', 'OperRev', 'OperInc',
    'AvlBeds', 'Compensation', 'MaxTerm'
]]
sc = StandardScaler()
reduce_data_std = sc.fit_transform(reduce_data_pca)

pca_result = pca(n_components=9).fit(reduce_data_std)
pca_result.explained_variance_
pca_result.components_.T * np.sqrt(pca_result.explained_variance_)

plt.figure(figsize=(7, 5))
plt.plot([1, 2, 3, 4, 5, 6, 7, 8, 9], pca_result.explained_variance_ratio_,
         '-o')
plt.ylabel('Proportion of Variance Explained')
plt.xlabel('Principal Component')
plt.xlim(0.25, 4.25)
plt.ylim(0, 1.05)
plt.xticks([1, 2, 3, 4, 5, 6, 7, 8, 9])

###Factor Analysis###

reduce_data_fac = reduce_data[[
Example no. 18
"""
@ Filename:       PCA_TEST.py
@ Author:         Danc1elion
@ Create Date:    2019-06-03   
@ Update Date:    2019-06-03 
@ Description:    Implement PCA_TEST
"""

from DimensionReduction import PCA
import numpy as np
from sklearn.decomposition import PCA as pca
import time

data = np.array([[2.5, 2.4], [0.5, 0.7], [2.2, 2.9], [1.9, 2.2], [3.1, 3.0],
                 [2.3, 2.7], [2, 1.6], [1, 1.1], [1.5, 1.6], [1.1, 0.9]])
time_start1 = time.time()
clf1 = PCA()
clf1.train(data)
print(clf1.transformData(data))
time_end1 = time.time()
print("Runtime of PCA:", time_end1 - time_start1)

time_start2 = time.time()
clf1 = pca(1)
x = clf1.fit_transform(data)
print(x)
time_end2 = time.time()
print("Runtime of sklearn PCA:", time_end2 - time_start2)
Example no. 19
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)

# #############################################################################
# Compute a PCA (eigenmris) on the mri dataset (treated as unlabeled
# dataset): unsupervised feature extraction / dimensionality reduction
n_components = 25

print("Extracting the top %d eigenmris from %d mris"
      % (n_components, X_train.shape[0]))
t0 = time()
PCA = pca(n_components=n_components, svd_solver='randomized',
          whiten=True).fit(X_train)

print("done in %0.3fs" % (time() - t0))

eigenmris = PCA.components_.reshape((n_components, h*16, w*16))

print("Projecting the input data on the eigenmris orthonormal basis")
t0 = time()
X_train_pca = PCA.transform(X_train)

print("done in %0.3fs" % (time() - t0))


clf = SVC(C=50, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
Example no. 20
    print(Eigen_Data.shape)
    #     a=Original_Data.values
    #     b=Eigen_Data.values
    c = np.dot(Original_Data.values, Eigen_Data.values)
    Reduced_Data = pd.DataFrame(c)
    return Reduced_Data


Dimension_Reduction_PCA = Reduced_Data(Train.iloc[:, 0:1024], Eigen_PCA)
Dimension_Reduction_PCA = pd.concat([Dimension_Reduction_PCA, Train['Label']],
                                    axis=1)

# In[33]:

from sklearn.decomposition import PCA as pca
pca_2d = pca(n_components=2)
inbuilt_pca = pca_2d.fit_transform(Train)

# In[35]:

inbuilt_pca = pd.DataFrame(inbuilt_pca)
inbuilt_pca = pd.concat([inbuilt_pca, Train['Label']], axis=1)

# In[39]:

import matplotlib.pyplot as plt
c = [
    'Red', "blue", 'green', 'orange', 'yellow', 'purple', 'black', 'magenta',
    'navy', 'skyblue', 'pink', 'brown'
]
for i in range(11):
Example no. 21
from sklearn.cluster import KMeans

k = 32

kmeans = KMeans(n_clusters = k)
kmeans.fit(X)

df = df[:len(vector_list)]

df['clusters'] = kmeans.labels_

centroids = pd.DataFrame(kmeans.cluster_centers_)

# RandomizedPCA has been removed from scikit-learn; PCA with svd_solver='randomized' is the equivalent
from sklearn.decomposition import PCA
pca = PCA(n_components=2, svd_solver='randomized')
Y = centroids.to_numpy()
pca.fit(Y)
subspace = pd.DataFrame(pca.transform(Y), columns=["x", "y"])

euclidean_distance = []

hog_points = pd.DataFrame(X)

for i in range(len(df)):
    tmp = hog_points.loc[i].to_numpy()
    cluster_integer = int(df.clusters.loc[i])

    euclidean_distance_i = np.linalg.norm(tmp - centroids.loc[cluster_integer].to_numpy())
    euclidean_distance.append(euclidean_distance_i)
    
Example no. 22
pd.set_option('precision', 19)
print(temp)
# calcualte_tf('1.csv',task0_output_dir)
ndarr = df.T.to_numpy()[0]
np.savetxt("foo.csv", ndarr, delimiter=",")
def pca(feature_matrix,k):

    pca = PCA(n_components=k)
    principalComponents = pca.fit_transform(feature_matrix)
    print(pca.explained_variance_ratio_)
    principalDf = pd.DataFrame(data=principalComponents)
    data_scaled = pd.DataFrame(preprocessing.scale(df), columns=df.columns)
    word_score_df = pd.DataFrame(pca.components_, columns=data_scaled.columns)
    print(word_score_df.iloc[0].sort_values(ascending=False))
    # print(principalDf)
    return 0

def svd(feature_matrix,k):

    svd = TruncatedSVD(n_components=k)
    components = svd.fit_transform(feature_matrix)
    print(svd.explained_variance_ratio_)
    principalDf = pd.DataFrame(data=components)
    # print(principalDf)
    return 0

pca(df,5)

# svd(df,5)

Example no. 23
    def __init__(self, A):
        scaled = preprocessing.StandardScaler().fit(A).transform(A)
        pca_res = pca().fit(scaled) 

        self.U = pca_res.components_.transpose() # loadings
        self.P = pca_res.transform( scaled )     # scores
Example no. 24
stimulus = 2000
identity_center = stimulus + 500
palatability_center = stimulus + 1000
window_radius = 125

iden_firing = dat.all_normalized_firing[...,(identity_center - window_radius)//dt:(identity_center + window_radius)//dt]
pal_firing = dat.all_normalized_firing[...,(palatability_center - window_radius)//dt:(palatability_center + window_radius)//dt]

def imshow(array):
    plt.imshow(array,interpolation='nearest',aspect='auto')

iden_firing_long = np.reshape(iden_firing,(iden_firing.shape[0],-1))
pal_firing_long = np.reshape(pal_firing,(pal_firing.shape[0],-1))

n_components = 3
red_iden_obj = pca(n_components = n_components).fit(iden_firing_long.T)
red_pal_obj = pca(n_components = n_components).fit(pal_firing_long.T)

# Plot eigenvectors for both states
fig,ax = plt.subplots(2)
ax[0].imshow(red_iden_obj.components_)
ax[1].imshow(red_pal_obj.components_)
plt.show()

# Matrix multiplication of the two component matrices gives the dot products
# of all pairs of eigenvectors
orthogonal_distance = np.matmul(red_iden_obj.components_,
                        red_pal_obj.components_.T)

#fig,ax=plt.subplots(3)
#plt.sca(ax[0])
Example no. 25
testData = np.array(
    pd.read_table(os.path.join(DAT, 'test.txt'),
                  header=None,
                  encoding='gb2312',
                  delim_whitespace=True))
train_y = trainData[:, -1]
train_x = np.delete(trainData, -1, axis=1)
test_y = testData[:, -1]
test_x = np.delete(testData, -1, axis=1)

time_start1 = time.time()
clf1 = PCA()
clf1.train(train_x)
train_x = clf1.transformData(train_x)
test_x = clf1.transformData(test_x)
clf = LogisticRegression(solver='liblinear', multi_class='ovr')
clf.fit(train_x, train_y)
print("Accuracy of PCA:", clf.score(test_x, test_y))
time_end1 = time.time()
print("Runtime of PCA:", time_end1 - time_start1)

time_start2 = time.time()
clf2 = pca(n_components=1)
train_x = clf2.fit_transform(train_x)
# project the test set with the PCA fitted on the training set (do not refit on test data)
test_x = clf2.transform(test_x)
clf = LogisticRegression(solver='liblinear', multi_class='ovr')
clf.fit(train_x, train_y)
print("Accuracy of sklearn PCA:", clf.score(test_x, test_y))
time_end2 = time.time()
print("Runtime of sklearn PCA:", time_end2 - time_start2)
Example no. 26
def pca_analysis(dataset,
                 dropcols=[],
                 imputenans=True,
                 scale=True,
                 rem_outliers=True,
                 out_thresh=10,
                 n_components=5,
                 existing_model=False,
                 model_file='Optional'):
    """Performs a primary component analysis on an input dataset

    Parameters
    ----------
    dataset : pandas.core.frame.DataFrame, shape (n, p)
        Input dataset with n samples and p features
    dropcols : list
        Columns to exclude from pca analysis. At a minimum, user must exclude
        non-numeric columns.
    imputenans : bool
        If True, impute NaN values as column means.
    scale : bool
        If True, columns will be scaled to a mean of zero and a standard
        deviation of 1.
    n_components : int
        Desired number of components in the principal component analysis.

    Returns
    -------
    pcadataset : diff_classifier.pca.Bunch
        Contains outputs of PCA analysis, including:
        scaled : numpy.ndarray, shape (n, p)
            Scaled dataset with n samples and p features
        pcavals : pandas.core.frame.DataFrame, shape (n, n_components)
            Output array of n_component features of each original sample
        final : pandas.core.frame.DataFrame, shape (n, p+n_components)
            Output array with principal components appended to the original array.
        prcomps : pandas.core.frame.DataFrame, shape (5, n_components)
            Output array displaying the top 5 features contributing to each
            principal component.
        prvals : dict of list of str
            Output dictionary of the pca scores for the top 5 features
            contributing to each principal component.
        components : pandas.core.frame.DataFrame, shape (p, n_components)
            Raw pca scores.

    """
    pd.options.mode.chained_assignment = None  # default='warn'
    dataset_num = dataset.drop(dropcols, axis=1)
    dataset_num = dataset_num.replace([np.inf, -np.inf], np.nan)

    if rem_outliers:
        for i in range(10):
            for col in dataset_num.columns:
                xmean = np.mean(dataset_num[col])
                xstd = np.std(dataset_num[col])

                counter = 0
                for x in dataset_num[col]:
                    if x > xmean + out_thresh * xstd:
                        dataset[col][counter] = np.nan
                        dataset_num[col][counter] = np.nan
                    if x < xmean - out_thresh * xstd:
                        dataset[col][counter] = np.nan
                        dataset_num[col][counter] = np.nan
                    counter = counter + 1

    dataset_raw = dataset_num.values

    # Fill in NaN values
    if imputenans:
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        imp.fit(dataset_raw)
        dataset_clean = imp.transform(dataset_raw)
    else:
        dataset_clean = dataset_raw

    # Scale inputs
    if scale:
        if existing_model:
            scaler = model_file.scaler
            dataset_scaled = model_file.scaler.transform(dataset_clean)
        else:
            scaler = stscale()
            scaler.fit(dataset_clean)
            dataset_scaled = scaler.transform(dataset_clean)
    else:
        scaler = None  # keep the attribute defined when scaling is skipped
        dataset_scaled = dataset_clean

    pcadataset = Bunch(scaled=dataset_scaled)

    if existing_model:
        pca1 = model_file.pcamodel
    else:
        pca1 = pca(n_components=n_components)
        pca1.fit(dataset_scaled)

    if not existing_model:
        # Cumulative explained variance ratio
        cum_var = 0
        explained_v = pca1.explained_variance_ratio_
        print('Cumulative explained variance:')
        for i in range(0, n_components):
            cum_var = cum_var + explained_v[i]
            print('{} component: {}'.format(i, cum_var))

    prim_comps = {}
    pcadataset.prvals = {}
    comps = pca1.components_
    pcadataset.components = pd.DataFrame(comps.transpose())
    for num in range(0, n_components):
        highest = np.abs(
            pcadataset.components[num]).values.argsort()[-5:][::-1]
        pels = []
        pcadataset.prvals[num] = pcadataset.components[num].values[highest]
        for col in highest:
            pels.append(dataset_num.columns[col])
        prim_comps[num] = pels

    # Main contributors to each principal component
    pcadataset.prcomps = pd.DataFrame.from_dict(prim_comps)
    pcadataset.pcavals = pd.DataFrame(pca1.transform(dataset_scaled))
    pcadataset.final = pd.concat([dataset, pcadataset.pcavals], axis=1)
    pcadataset.pcamodel = pca1
    pcadataset.scaler = scaler

    return pcadataset
Example no. 27
day3_data = np.asarray(data3.normal_off_firing)[:,day3_nrns,:,0:269].swapaxes(-1,-2)

all_data = np.concatenate((day1_data,day3_data[:,:,:,:32]),axis=0)
# =============================================================================
#  Take means of data and generate coordinate transformation based on that
# =============================================================================
day1_mean = np.mean(day1_data,axis=-1)
day3_mean = np.mean(day3_data,axis=-1)

all_data_mean = np.concatenate((day1_mean,day3_mean),axis=0)

all_data_long = all_data_mean[0,:,:]
for taste in range(1,all_data_mean.shape[0]):
    all_data_long = np.concatenate((all_data_long,all_data_mean[taste,:,:]),axis=-1)

all_mean_red_pca = pca(n_components = 3).fit(all_data_long.T)
all_mean_red = all_mean_red_pca.transform(all_data_long.T)

# Convert mean data back to array
all_mean_red_array = np.zeros((all_data_mean.shape[0],all_mean_red.shape[1],all_data_mean.shape[2]))
all_mean_red_list = np.split(all_mean_red,6)
for taste in range(len(all_mean_red_list)):
    all_mean_red_array[taste,:,:] = all_mean_red_list[taste].T
    
# Smooth mean data
smooth_mean_dat = np.zeros(all_mean_red_array.shape)
for taste in range(smooth_mean_dat.shape[0]):
    for dim in range(smooth_mean_dat.shape[1]):
        smooth_mean_dat[taste,dim,:] = scipy.ndimage.filters.gaussian_filter(all_mean_red_array[taste,dim,:],1)

# Use same transformation to reduce single trials
total_dat = np.asarray(total_dat)
total_dat_long = total_dat[0, :, :]
for comp in range(1, n_components):
    total_dat_long = np.concatenate((total_dat_long, total_dat[comp, :, :]),
                                    axis=0)

# ____                 _
#|  _ \ __ _ _ __   __| | ___  _ __ ___
#| |_) / _` | '_ \ / _` |/ _ \| '_ ` _ \
#|  _ < (_| | | | | (_| | (_) | | | | | |
#|_| \_\__,_|_| |_|\__,_|\___/|_| |_| |_|
#
# ____            _           _   _
#|  _ \ _ __ ___ (_) ___  ___| |_(_) ___  _ __
#| |_) | '__/ _ \| |/ _ \/ __| __| |/ _ \| '_ \
#|  __/| | | (_) | |  __/ (__| |_| | (_) | | | |
#|_|   |_|  \___// |\___|\___|\__|_|\___/|_| |_|
#              |__/

# Compare with PCA
pca_transformer = pca(n_components=2)
pca_data = pca_transformer.fit_transform(total_dat_long)
plt.subplot(211)
plt.scatter(pca_data[:, 0], pca_data[:, 1])

# Random projection
transformer = sparse_random(n_components=2)
X_new = transformer.fit_transform(total_dat_long)
plt.subplot(212)
plt.scatter(X_new[:, 0], X_new[:, 1])
data.get_data()
data.get_firing_rates()


# =============================================================================
# =============================================================================
## Visualize change in representation because of Opto
opto = np.asarray(np.sort([1,2]*60))

all_firing = np.concatenate([data.all_normal_off_firing,data.all_normal_on_firing], axis=1)
all_firing = all_firing[:,:,80:200]
all_firing_long = all_firing[0,:,:]
for nrn in range(1,all_firing.shape[0]):
    all_firing_long = np.concatenate((all_firing_long,all_firing[int(nrn),:,:]),axis=1)
    
all_reduced_pca = pca(n_components = 15).fit(all_firing_long)
all_reduced = all_reduced_pca.transform(all_firing_long)

# =============================================================================
# plt.plot(all_reduced_pca.explained_variance_ratio_/np.max(all_reduced_pca.explained_variance_ratio_),'-o')
# plt.xlabel('PCA number');plt.ylabel('Variance Explained Ratio')
# 
# plt.set_cmap('viridis')
# 
# plt.figure()
# plt.scatter(all_reduced[:,0],all_reduced[:,1],
#                c =opto, s=20)
# plt.colorbar()
# plt.xlabel('PCA1');plt.ylabel('PCA2')
# 
# 
pg_data.dtypes
pg_data.columns
pg_data.head()

#################################################
#==========Principal Component Analysis=========#
# Perform a PCA for PU, PEOU, and Intention	    #
#################################################

reduc_data_pca = reduc_data[[
    'peruse01', 'peruse02', 'peruse03', 'peruse04', 'peruse05', 'peruse06',
    'pereou01', 'pereou02', 'pereou03', 'pereou04', 'pereou05', 'pereou06',
    'intent01', 'intent02', 'intent03'
]]

pca_result = pca(n_components=15).fit(reduc_data_pca)

#Obtain eigenvalues
pca_result.explained_variance_

#Components from the PCA
pca_result.components_

plt.figure(figsize=(7, 5))
plt.plot([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
         pca_result.explained_variance_ratio_, '-o')
plt.ylabel('Proportion of Variance Explained')
plt.xlabel('Principal Component')
plt.xlim(0.75, 4.25)
plt.ylim(0, 1.05)
plt.xticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
    off_firing = data.normal_off_firing

    print('file %i' % file)

    n_components = 2

    for taste in range(4):
        stim_off = data.normal_off_firing[taste]
        stim_off = stim_off[:, :, stimulus_inds]
        total_stim_off = stim_off[0, :, :]
        for nrn in range(1, stim_off.shape[0]):
            total_stim_off = np.concatenate(
                (total_stim_off, stim_off[int(nrn), :, :]), axis=1)

        reduced_stim_pca = pca(n_components=10).fit(total_stim_off)
        #print(sum(reduced_stim_pca.explained_variance_ratio_))
        reduced_stim = reduced_stim_pca.transform(total_stim_off)

        gmm = GaussianMixture(n_components=n_components,
                              covariance_type='full',
                              n_init=500).fit(reduced_stim)
        #print(gmm.predict(reduced_stim))

        groups = gmm.predict(reduced_stim)
        all_groups.append(sum(groups))
        trial_order = np.argsort(groups)

        # Train LDA classifier on firing from both clusters

        repeats = 500
def pca_cal(dataset, labels, attNames, **kwargs):
    p = pca(n_components=2)
    trained = p.fit_transform(dataset)
    plot(trained, labels, attNames, **kwargs)