def analyze_index(data):

    data = data[["Mexico", *features_GDT]].dropna()

    Y = data['Mexico'].apply(lambda x: x > data['Mexico'].mean())
    # X = SimpleImputer(strategy='mean').fit_transform(data[features_GDT])
    X = np.hstack([
        StandardScaler().fit_transform(feature[:, None])
        for feature in np.array(data[features_GDT]).T
    ])

    stepwise = RFE(estimator=LogisticRegression(solver='lbfgs'))
    stepwise.fit(X, Y)

    print('ranking')
    print(list(sorted(zip(stepwise.ranking_, features_GDT))))

    Y_pred = stepwise.predict(X)
    print(accuracy_score(Y, Y_pred))
    print(confusion_matrix(Y, Y_pred))

    model_fa = FactorAnalysis()
    model_fa.fit_transform(X)
    loadings = varimax(model_fa.components_[:6].T)
    print(loadings)

    factors = []
    for loading in (loadings > .7).T:
        factors.append([
            feature for feature, included in zip(features_GDT, loading)
            if included
        ])
    print(factors)
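The varimax helper called above is not defined in this snippet. As a rough sketch (an assumption, not the original author's implementation), a plain-NumPy varimax rotation could look like the following; recent scikit-learn releases can also rotate directly via FactorAnalysis(rotation='varimax').

import numpy as np

def varimax(loadings, gamma=1.0, max_iter=100, tol=1e-6):
    # Iteratively rotate the loading matrix to maximize the varimax criterion.
    p, k = loadings.shape
    R = np.eye(k)
    var = 0.0
    for _ in range(max_iter):
        L = loadings @ R
        u, s, vt = np.linalg.svd(
            loadings.T @ (L ** 3 - (gamma / p) * L @ np.diag(np.sum(L ** 2, axis=0)))
        )
        R = u @ vt
        new_var = np.sum(s)
        if var != 0 and new_var < var * (1 + tol):
            break
        var = new_var
    return loadings @ R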
Example No. 2
def model_process(X, y):
    """
    Process the data with the trained model (factor analysis).
    :param X: independent variables
    :param y: dependent variable
    :return: result
    """
    fa = FactorAnalysis()
    fa.fit_transform(X, y)
    # print(fa.get_covariance())
    print(fa.components_)
Example No. 3
def run_FA(X,y,title):
    
    fa = FA(random_state=5)
    fa.fit_transform(X)
    vn = fa.noise_variance_
    print(vn)
    plt.plot(list(range(len(vn))), vn, 'm-')
    plt.xlabel('component')
    plt.ylabel('noise variance')
    plt.tick_params('y', colors='m')
    plt.title("FA Noise Variance: "+ title)
    plt.show()
def factor_analyses(results_dir):
	data_array = np.genfromtxt(os.path.join(results_dir,'summary.csv'),delimiter=',')
	fa1 = FactorAnalysis(n_components = 1)
	new_array_gbm = fa1.fit_transform(np.transpose(data_array[range(15)]))
	print(new_array_gbm.shape)
	fa2 = FactorAnalysis(n_components = 1)
	new_array_tree = fa2.fit_transform(np.transpose(data_array[list(range(41,51)) + list(range(54,64))]))
	print(new_array_tree.shape)

	fa3 = FactorAnalysis(n_components = 1)
	new_array_lin = fa3.fit_transform(np.transpose(data_array[list(range(27,41)) + list(range(51,54))]))

	fa4 = FactorAnalysis(n_components = 1)
	new_array_knn = fa4.fit_transform(np.transpose(data_array[range(16,27)]))

	datasets = [line.rstrip('\n') for line in open(os.path.join(results_dir, 'datasets.csv'), 'r').readlines()]
	methods = [line.rstrip('\n') for line in open(os.path.join(results_dir, 'methods.csv'), 'r').readlines()]
	figure()
	pretty_scatter(new_array_tree, [1 for x in range(115)], data_array[46], 200*np.ones(new_array_tree.shape), ['' for d in datasets])
	xlabel('Dimension 1')
	ylabel('Arbitrary Dimension 2')
	colorbar()

	figure()

	plot(new_array_lin, new_array_tree, 'bo')
	xlabel('Linear')
	ylabel('Tree + RF')

	figure()
	subplot(2,2,1)
	scatter(new_array_gbm, new_array_tree)
	xlabel('GBM')
	ylabel('Tree + RF')

	#figure()
	subplot(2,2,2)
	scatter(new_array_knn, new_array_tree)
	xlabel('KNN')
	ylabel('Tree + RF')

	#figure()
	subplot(2,2,3)
	scatter(new_array_knn, new_array_lin)
	xlabel('KNN')
	ylabel('Linear')

	subplot(2,2,4)
	scatter(new_array_gbm, new_array_lin)
	xlabel('GBM')
	ylabel('Linear')
	show()
    def train_FA(self, components, feature_patches, zero_pad, feature_map_shape, stride, **kwargs):
        """
        Function to compute Factor Analysis (FA) of input patches.
        :param components: The requested components (list)
        :param feature_patches: Input feature patches to compute FA on them [patches, height, width]
        Warning: This function modifies the feature_patches argument!
        :param zero_pad: Boolean parameter to indicate if feature maps are zero-padded
        :param feature_map_shape: Shape of the feature map [batch, height, width]
        :param stride: Stride used in generation of patches
        :return: FA feature extractor (a single object) and extracted features [batch, height, width, channel].
        """

        # Check if kernel mode is not enabled while FA is the function to extract features.
        assert not self.cfg["kernel_mode"], 'Kernel mode is not supported for FA.'

        # Define an instance of FA dimensionality reduction
        fa = FactorAnalysis(n_components=max(components) + 1, copy=False, max_iter=self.cfg["max_iteration_FA"])

        # Reshape the input data to a 2D array (an array of 1D input data)
        feature_patches_shape = feature_patches.shape
        feature_patches = np.reshape(feature_patches,
                                     (feature_patches.shape[0], feature_patches.shape[1] * feature_patches.shape[2]))

        # Fit the Factor Analysis model and extract the new feature maps
        extracted_features = fa.fit_transform(feature_patches)

        # Reshape the extracted patch from [batch * height * width, channel] to [batch, height, width, channel]
        extracted_features = reshape_feature_vector(extracted_features, feature_patches_shape, zero_pad,
                                                    feature_map_shape, stride)

        # Extract the requested components
        extracted_features = extracted_features[:, :, :, components]

        return fa, extracted_features
Example No. 6
File: main.py  Project: nerdslab/DAD
def compute_V(Y, X_test, T_test, d=3, methods=['PCA', 'MDS', 'FA', 'Isomap']):
    L = len(methods)
    V = []
    for idx in range(L):
        if methods[idx] == 'PCA':
            pca = PCA(n_components=d)
            V.append(pca.fit_transform(Y))
        elif methods[idx] == 'MDS':
            mds = MDS(n_components=d)
            V.append(mds.fit_transform(Y))
        elif methods[idx] == 'FA':
            fa = FactorAnalysis(n_components=d)
            V.append(fa.fit_transform(Y))
        elif methods[idx] == 'Isomap':
            isomap = Isomap(n_components=d)
            V.append(isomap.fit_transform(Y))

    plt.figure(figsize=(8, 6))
    plt.subplot(1, 2, 1)
    utils.color_data(X_test, T_test)
    plt.title('Ground Truth')
    plt.subplot(1, 2, 2)
    utils.color_data(V[0], T_test)
    plt.title(methods[0])
    plt.show()

    return V
Example No. 7
def fa_profiles(df, control_wells, wells, plates, components=60):
    control_X = df.loc[df.Well.isin(control_wells)].values[:, 5:].astype(float)
    fa = FactorAnalysis(n_components=components)
    control_X_reduc = fa.fit_transform(control_X)
    # Compute hat matrix for the MAP estimate
    A = fa.components_.T
    mu = fa.mean_
    sigma = np.diag(fa.noise_variance_)
    A_hat = A.T @ np.linalg.inv((A @ A.T) + sigma)

    reagent_profiles_dict = {}
    for well in wells:
        plate_profiles = []
        for plate in plates:
            reagent_X = df.loc[df.Well.eq(well)
                               & df.Plate.eq(plate)].values[:,
                                                            5:].astype(float)
            v = np.mean(reagent_X, axis=0) - mu
            plate_profiles.append(A_hat @ v)

        well_profile = np.median(np.array(plate_profiles), axis=0)
        reagent_profiles_dict[well] = well_profile

    return pd.DataFrame.from_dict(
        reagent_profiles_dict, orient='index', columns=list(
            range(components))).reset_index().rename(columns={'index': 'Well'})
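The hand-built hat matrix above is the posterior-mean projector A.T @ inv(A @ A.T + Psi) for factor scores. As a quick sanity check on synthetic data (illustrative only, not part of the original function), it should agree with FactorAnalysis.transform:

import numpy as np
from sklearn.decomposition import FactorAnalysis

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 20))
fa = FactorAnalysis(n_components=5).fit(X)
A = fa.components_.T                                   # (n_features, n_factors)
A_hat = A.T @ np.linalg.inv((A @ A.T) + np.diag(fa.noise_variance_))
scores_manual = (X - fa.mean_) @ A_hat.T
print(np.allclose(scores_manual, fa.transform(X), atol=1e-6))   # expected: True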
def fit_factor_analysis(percentage=0.8):
    """
    Runs the factor analysis.

    Parameters:

        percentage: float, default: 0.8

            The fraction of the cumulative sum of the eigenvalues to be retained. This determines how many loading factors are kept in the analysis.

    Returns:
        
        X: array of floats [n_samples,n_factors]

            The transformed data after the factor analysis.

        components: array of floats [n_factors,n_samples]

            The components of the factor analysis
    """
    fa = FactorAnalysis()
    fa.fit(data)
    C = fa.get_covariance()
    l,e = np.linalg.eigh(C)
    cs = np.cumsum(l[::-1])/np.sum(l)
    n = np.sum(cs<percentage)

    fa.n_components = n
    X_ = fa.fit_transform(data)
    components = fa.components_
    return X_,components
Example No. 9
def d_lfa():
    # latent factor analysis: drops PCA's orthogonality constraint

    lfa = FactorAnalysis(n_components=2)
    X_lfa = lfa.fit_transform(iris.data)
    plt.scatter(X_lfa[:, 0], X_lfa[:, 1], c=iris.target, edgecolors="none")
    plt.show()
Example No. 10
def embarked_onehot(df):
    """
    Transforms embarked into OneHotEncoder columns.

    :param df: Input DataFrame that contains 'Embarked' column

    :return: df
    :rtype: pandas DataFrame
    """

    # TODO make it foolproof if number of categories in train is different from test and etc
    # TODO better treat NaNs

    # Does basically the same thing as OneHotEncoder
    embarked_df = pd.get_dummies(df.Embarked)

    # Merges DataFrames (OneHotEncoder and DataFrame being processed)
    ndf = pd.concat([df, embarked_df], axis=1)

    fa = FactorAnalysis(n_components=1)
    y = fa.fit_transform(embarked_df.values)
    ndf['embarked_fa'] = y

    # Returns transformed DataFrame
    return ndf
Example No. 11
def gridsearch_svm(Xtrain, Ytrain, Xval, Yval):
    #---------------------------------- Scaling
    X1, scaler = scale_data(Xtrain)
    X2 = scale_data(Xval, scaler)
    #---------------------------------- Factor analysis
    fa = FactorAnalysis()
    X1 = fa.fit_transform(X1)
    X2 = fa.transform(X2)
    #---------------------------------- Cross validation and grid search
    cv = ShuffleSplit(len(Xtrain),
                      n_iter=1,
                      train_size=0.25,
                      test_size=.03,
                      random_state=0)
    params = {'C': [1, 10], 'kernel': ['rbf', 'linear']}
    svr = svm.SVC(verbose=True, shrinking=False)
    classifier = grid_search.GridSearchCV(svr, params, verbose=3, cv=cv)
    t0 = time()
    classifier.fit(X1, Ytrain)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    #---------------------------------- Prediction on validation set:
    t0 = time()
    pred = list(classifier.predict(X2))
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)
    if hasattr(classifier, 'coef_'):
        print("dimensionality: %d" % classifier.coef_.shape[1])
        print("density: %f" % density(classifier.coef_))
    print('F1-score : ', f1_score(Yval, pred, average='binary'))
    print("classification report:")
    print(classification_report(Yval, pred, target_names=['0', '1'], digits=4))
    print("confusion matrix:")
    print(confusion_matrix(Yval, pred))
    return classifier, scaler
Example No. 12
def plot_clusters(Data1, Data2, l1, l2, fn):

    x = np.hstack((Data1[0], Data2[0]))
    y = np.hstack((Data1[1], Data2[1]))
    z = np.hstack((Data1[2], Data2[2]))
    X = np.matrix((x, y, z))
    Y = stats.zscore(np.array(X), axis=1)
    X = np.matrix(Y)
    pca = PCA(n_components=2)
    pca.fit(X.T)
    h = np.hstack((np.zeros(Data1.shape[1]), np.ones(Data2.shape[1])))
    #sz = 10*np.ones(Data1.shape[1] + Data2.shape[1])

    sns.set_theme(style="darkgrid", font_scale=2)
    f, ax = plt.subplots(figsize=(7, 5))

    g = sns.scatterplot(x=x, y=y, hue=h, s=300, legend=False, palette=[l1, l2])
    g.set(xlim=(0.5, 1))
    g.set(ylim=(0.5, 1))

    cx = np.array([np.mean(Data1[0]), np.mean(Data2[0])])
    cy = np.array([np.mean(Data1[1]), np.mean(Data2[1])])
    ch = np.array([0, 1])
    dist = np.sqrt((cx[0] - cx[1])**2 + (cy[0] - cy[1])**2)

    g = sns.scatterplot(x=cx,
                        y=cy,
                        hue=ch,
                        s=500,
                        legend=False,
                        marker='X',
                        palette=[l1, l2])
    ax1 = g.axes
    ax1.plot(cx, cy, color='black', linewidth=5.0)
    ax1.text(
        np.mean(cx) + 0.01,
        np.mean(cy) - 0.01, "d=" + str(round(dist, 2)))
    ax1.set(ylabel="Angle Symmetry")
    ax1.set(xlabel="Pose Symmetry")
    #g.legend_.remove()

    transformer = FactorAnalysis(n_components=2,
                                 random_state=0,
                                 rotation='varimax')
    X_transformed = transformer.fit_transform(X.T)
    print(transformer.components_)
    fn2 = fn[0:-4] + '_FA_' + fn[-4:]
    PlotHeatMapGrid(transformer.components_.T, ['Component 1', 'Component 2'],
                    Labels2=['Distance', 'Angle', 'Drift'],
                    fn=fn2,
                    size=(7, 6))
    #components = X_transformed.components_.T

    print('Eig: ', pca.explained_variance_ratio_)
    print('EV: ', pca.components_)

    if fn is not None:
        f.savefig("..//Figures//" + fn, dpi=600, bbox_inches='tight')
    else:
        plt.show(block=False)
Example No. 13
def cabin_first_letter(df):
    # Does basically the same thing as OneHotEncoder
    cabin_df = pd.get_dummies(df['Cabin'].str[0], prefix='cabin_first_letter', dummy_na=True)
    cabin_df['cabin_first_letter_other'] = 0

    columns = ['cabin_first_letter_A', 'cabin_first_letter_B', 'cabin_first_letter_C', 'cabin_first_letter_D',
               'cabin_first_letter_E', 'cabin_first_letter_F', 'cabin_first_letter_G', 'cabin_first_letter_T',
               'cabin_first_letter_nan', 'cabin_first_letter_other']

    # Code below harmonizes OneHotEncoder between train and test sets
    for column in list(cabin_df.columns):
        if column not in columns:
            cabin_df['cabin_first_letter_other'] = cabin_df['cabin_first_letter_other'] + cabin_df[column]
            cabin_df = cabin_df.drop(columns=column)

    for column in columns:
        if column not in cabin_df.columns:
            cabin_df[column] = 0

    cabin_df = cabin_df[columns]
    fa = FactorAnalysis(n_components=1)
    y = fa.fit_transform(cabin_df.values)

    # Merges DataFrames (OneHotEncoder and DataFrame being processed)
    ndf = pd.concat([df, cabin_df], axis=1)
    ndf['cabin_fa'] = y

    return ndf
Example No. 14
def factor_analysis(results_dir):
	data_array = np.transpose(np.genfromtxt(os.path.join(results_dir,'summary.csv'),delimiter=','))
	fa = FactorAnalysis(n_components = 2)
	new_array = fa.fit_transform(data_array)
	print(fa.get_covariance().shape)
	print(new_array)
	np.savetxt(os.path.join(results_dir,'FA-datasets-2.csv'), new_array, delimiter=',')
Example No. 15
class FactorAnalysis():
    def __init__(self, cols, n_components):
        self.n_components = n_components
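        # Note: this wrapper class shadows sklearn's FactorAnalysis, so the sklearn
        # class presumably needs to be imported under an alias for the line below to work.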
        self.model = FactorAnalysis(n_components=n_components)
        self.columns = cols

    def fit(self, data):
        self.model.fit(data[self.columns])

    def fit_transform(self, data):
        transformed = self.model.fit_transform(data[self.columns])
        transformed = pd.DataFrame(
            transformed,
            columns=["fa_" + str(i + 1) for i in range(self.n_components)])
        data = pd.concat([data, transformed], axis=1)
        data = data.drop(self.columns, axis=1)
        return data

    def transform(self, data):
        transformed = self.model.transform(data[self.columns])
        transformed = pd.DataFrame(
            transformed,
            columns=["fa_" + str(i + 1) for i in range(self.n_components)])
        data = pd.concat([data, transformed], axis=1)
        data = data.drop(self.columns, axis=1)
        return data
 def reduceDataset(self,nr=3,method='PCA'):
     '''It reduces the dimensionality of a given dataset using different techniques provided by Sklearn library
      Methods available:
                         'PCA'
                         'FactorAnalysis'
                         'KPCArbf','KPCApoly'
                         'KPCAcosine','KPCAsigmoid'
                         'IPCA'
                         'FastICADeflation'
                         'FastICAParallel'
                         'Isomap'
                         'LLE'
                         'LLEmodified'
                         'LLEltsa'
     '''
     dataset=self.ModelInputs['Dataset']
     #dataset=self.dataset[Model.in_columns]
     #dataset=self.dataset[['Humidity','TemperatureF','Sea Level PressureIn','PrecipitationIn','Dew PointF','Value']]
     #PCA
     if method=='PCA':
         sklearn_pca = sklearnPCA(n_components=nr)
         reduced = sklearn_pca.fit_transform(dataset)
     #Factor Analysis
     elif method=='FactorAnalysis':
         fa=FactorAnalysis(n_components=nr)
         reduced=fa.fit_transform(dataset)
     #kernel pca with rbf kernel
     elif method=='KPCArbf':
         kpca=KernelPCA(nr,kernel='rbf')
         reduced=kpca.fit_transform(dataset)
     #kernel pca with poly kernel
     elif method=='KPCApoly':
         kpca=KernelPCA(nr,kernel='poly')
         reduced=kpca.fit_transform(dataset)
     #kernel pca with cosine kernel
     elif method=='KPCAcosine':
         kpca=KernelPCA(nr,kernel='cosine')
         reduced=kpca.fit_transform(dataset)
     #kernel pca with sigmoid kernel
     elif method=='KPCAsigmoid':
         kpca=KernelPCA(nr,kernel='sigmoid')
         reduced=kpca.fit_transform(dataset)
     #Incremental PCA
     elif method=='IPCA':
         ipca=IncrementalPCA(nr)
         reduced=ipca.fit_transform(dataset)
     #Fast ICA
     elif method=='FastICAParallel':
         fip=FastICA(nr,algorithm='parallel')
         reduced=fip.fit_transform(dataset)
     elif method=='FastICADeflation':
         fid=FastICA(nr,algorithm='deflation')
         reduced=fid.fit_transform(dataset)
     elif method == 'All':
         self.dimensionalityReduction(nr=nr)
         return self
     
     self.ModelInputs.update({method:reduced})
     self.datasetsAvailable.append(method)
     return self
Example No. 17
def factor_analysis(df, group_value=3):
    # check group_value range
    if group_value <= 1 or group_value > len(df.columns):
        print("Group value has to be between 1 and number of columns. "
              "Change to use default group value 3. ")
        group_value = 3
    
    try:
        # round group_value in case the parameter is float
        group_value = int(round(group_value))
    
        # transform df using factor analysis
        fa = FactorAnalysis(n_components = group_value, random_state=0)
        df_fa = fa.fit_transform(df)
        
        plt.figure()
        plt.title('Factor Analysis Components')
        
        for i in range(group_value):
            for j in range(i+1, group_value):
                plt.scatter(df_fa[:,i], df_fa[:,j])
        
        plt.show()
        
        return df_fa
    except ValueError:
        print("Skipped n_component = " + str(group_value) + " since i > min(n_samples, n_features).")
Example No. 18
def plot_compare(X_compare, S_compare, S_ica):

    pca = PCA(n_components=3)
    S_pca = pca.fit_transform(X_compare)

    fa = FactorAnalysis(n_components=3)
    S_fa = fa.fit_transform(X_compare)

    models = [X_compare, S_compare, S_ica, S_pca, S_fa]
    names = [
        'Observations (mixed signal)', 'True Sources',
        'FastICA recovered IC signals', 'PCA recovered IC signals',
        'Factor Analysis recovered IC signals'
    ]
    colors = ['red', 'steelblue', 'orange']

    plt.figure(figsize=(10, 6))
    for ii, (model, name) in enumerate(zip(models, names),
                                       1):  # enumerate starts from 1
        plt.subplot(5, 1, ii)
        plt.title(name)
        for sig, color in zip(model.T, colors):
            plt.plot(sig, color=color)
            plt.xticks([])
            plt.yticks([])
    plt.subplots_adjust(0.09, 0.09, 0.94, 0.94, 0.5, 1)
 def dimensionalityReduction(self,nr=5):
     '''It applies all the dimensionality reduction techniques available in this class:
     Techniques available:
                         'PCA'
                         'FactorAnalysis'
                         'KPCArbf','KPCApoly'
                         'KPCAcosine','KPCAsigmoid'
                         'IPCA'
                         'FastICADeflation'
                         'FastICAParallel'
                         'Isomap'
                         'LLE'
                         'LLEmodified'
                         'LLEltsa'
     '''
     dataset=self.ModelInputs['Dataset']
     sklearn_pca = sklearnPCA(n_components=nr)
     p_components = sklearn_pca.fit_transform(dataset)
     fa=FactorAnalysis(n_components=nr)
     factors=fa.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='rbf')
     rbf=kpca.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='poly')
     poly=kpca.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='cosine')
     cosine=kpca.fit_transform(dataset)
     kpca=KernelPCA(nr,kernel='sigmoid')
     sigmoid=kpca.fit_transform(dataset)
     ipca=IncrementalPCA(nr)
     i_components=ipca.fit_transform(dataset)
     fip=FastICA(nr,algorithm='parallel')
     fid=FastICA(nr,algorithm='deflation')
     ficaD=fid.fit_transform(dataset)
     ficaP=fip.fit_transform(dataset)
     '''isomap=Isomap(n_components=nr).fit_transform(dataset)
     try:
         lle1=LocallyLinearEmbedding(n_components=nr).fit_transform(dataset)
     except ValueError:
         lle1=LocallyLinearEmbedding(n_components=nr,eigen_solver='dense').fit_transform(dataset)
     try:
         
         lle2=LocallyLinearEmbedding(n_components=nr,method='modified').fit_transform(dataset)
     except ValueError:
         lle2=LocallyLinearEmbedding(n_components=nr,method='modified',eigen_solver='dense').fit_transform(dataset) 
     try:
         lle3=LocallyLinearEmbedding(n_components=nr,method='ltsa').fit_transform(dataset)
     except ValueError:
         lle3=LocallyLinearEmbedding(n_components=nr,method='ltsa',eigen_solver='dense').fit_transform(dataset)'''
     values=[p_components,factors,rbf,poly,cosine,sigmoid,i_components,ficaD,ficaP]#,isomap,lle1,lle2,lle3]
     keys=['PCA','FactorAnalysis','KPCArbf','KPCApoly','KPCAcosine','KPCAsigmoid','IPCA','FastICADeflation','FastICAParallel']#,'Isomap','LLE','LLEmodified','LLEltsa']
     self.ModelInputs.update(dict(zip(keys, values)))
     [self.datasetsAvailable.append(key) for key in keys ]
     
     #debug
     #dataset=pd.DataFrame(self.ModelInputs['Dataset'])
     #dataset['Output']=self.ModelOutput
     #self.debug['Dimensionalityreduction']=dataset
     ###
     return self
Example No. 20
def decomp(data, data_vecs, labels):
    model = FactorAnalysis(n_components=3)
    reduced_data = model.fit_transform(data_vecs)

    for i, r in enumerate(reduced_data):
        print(r)
        print(data[i])
        print('\n-----------------------------------\n')
Example No. 21
def factoranal(input, finaldim):
    import numpy
    if not isinstance(input, numpy.ndarray):
        input = input.todense()
    from sklearn.decomposition import FactorAnalysis
    fa = FactorAnalysis(n_components=finaldim)
    res = fa.fit_transform(input)
    return fa.components_.transpose(), res
def find_FA(data_X, data_Y, filename, est_name):
    for NUM_ATTR in range(1, data_X.shape[1] + 1):
        fa = FactorAnalysis(n_components=NUM_ATTR,
                            random_state=37,
                            max_iter=3000)
        reduced_FA = fa.fit_transform(data_X)
        select_comp_supervised(reduced_FA, data_Y, filename, NUM_ATTR,
                               est_name)
Example No. 23
def sibsparch_fa(df):
    sibsparch_df = df[['SibSp', 'Parch']]
    fa = FactorAnalysis(n_components=1)
    y = fa.fit_transform(sibsparch_df.values)

    df['sibsparch_fa'] = y

    return df
Example No. 24
def FA(dataset, target_dimension):
    converter = FactorAnalysis(n_components=target_dimension, random_state=0)
    fa_data = converter.fit_transform(dataset.to_numpy())
    reduced_columns = [
        'nf{}'.format(str(i + 1)) for i in range(target_dimension)
    ]
    result_data = pd.DataFrame(fa_data, columns=reduced_columns)
    return result_data
def factor_analysis(y_mat, num_components):
    from sklearn.decomposition import FactorAnalysis
    F = FactorAnalysis(num_components)
    transformed = F.fit_transform(
        y_mat.transpose())  # shape: time x components
    components = F.components_
    mn = F.mean_
    noise_variance = F.noise_variance_
    return transformed, components, mn, noise_variance
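A minimal usage sketch on random data (shapes assumed: y_mat is variables x time, so the analysis runs on its transpose); the returned pieces reconstruct the data approximately as transformed @ components + mean:

import numpy as np

y_mat = np.random.default_rng(0).normal(size=(30, 1000))   # variables x time
transformed, components, mn, noise_variance = factor_analysis(y_mat, 5)
recon = transformed @ components + mn                       # time x variables
print(np.mean((y_mat.T - recon) ** 2))                      # residual left to the noise term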
Example No. 26
    def _fit(self, X, n_components, sample_size):
        self._reset()
        Y = np.delete(X, range(0, 14), axis=1)  # delete columns 0-13 (the knob columns)
        np.random.shuffle(Y)  # shuffles rows only, by default
        Y = Y[:sample_size, :]  # keep only the first sample_size rows of the matrix
        Y=Y.transpose()
        print("Shape before:", Y.shape)
        model= FactorAnalysis(n_components=n_components, random_state=0)
        model.fit_transform(Y)
        self.model_=model
        print(self.model_.components_.shape) #metrics X factors
        #filtering out any components with 0 values
        self.model_.components_=self.model_.components_.transpose()
        components_mask = np.sum(self.model_.components_ != 0.0, axis=1) > 0.0
        self.components_ = self.model_.components_[components_mask]
        print("Shape after:",self.components_.shape)

        return self
Example No. 27
def factorAnalysis(data, percentage=0.535):
    dataMat = np.array(data)
    newData, meanVal = zeroMean(data)  # mean-centering
    covMat = covArray(newData)  #covariance matrix
    eigVals, eigVects = featureMatrix(covMat)
    n_components = percentage2n(eigVals, percentage)
    clf = FactorAnalysis(n_components=n_components)
    new_data = clf.fit_transform(dataMat)
    return new_data
Example No. 28
def testAlgorithm():
    import matplotlib.pyplot as plt

    random.seed(35)
    np.random.seed(32)

    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(
        n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = block_ZIFA.fitModel(Y, k)
    colors = ['red', 'blue', 'green']
    cluster_ids = sorted(list(set(ids)))
    model = FactorAnalysis(n_components=k)
    factor_analysis_Zhat = model.fit_transform(Y)

    plt.figure(figsize=[15, 5])

    plt.subplot(131)
    for id in cluster_ids:
        plt.scatter(Z[ids == id, 0],
                    Z[ids == id, 1],
                    color=colors[id - 1],
                    s=4)
        plt.title('True Latent Positions\nFraction of Zeros %2.3f' %
                  (Y == 0).mean())
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])

    plt.subplot(132)
    for id in cluster_ids:
        plt.scatter(Zhat[ids == id, 0],
                    Zhat[ids == id, 1],
                    color=colors[id - 1],
                    s=4)
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])
        plt.title('ZIFA Estimated Latent Positions')
        # title(titles[method])

    plt.subplot(133)
    for id in cluster_ids:
        plt.scatter(factor_analysis_Zhat[ids == id, 0],
                    factor_analysis_Zhat[ids == id, 1],
                    color=colors[id - 1],
                    s=4)
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])
        plt.title('Factor Analysis Estimated Latent Positions')

    plt.show()
Example No. 29
def FAforAllworkloads(n_c, frame):
    all_metrics_data = frame.values

    all_metrics_data_Trans = all_metrics_data.T
    tmp_all_transformer = FactorAnalysis(n_components=n_c, random_state=0)
    tmp_workload_A_transformed = tmp_all_transformer.fit_transform(
        all_metrics_data_Trans)

    return tmp_workload_A_transformed
Example No. 30
def dimension_reduction(train_x, train_y, test_x, n_col, method='fact'):
    # Obtain column names
    attr_list = train_x.columns

    # Using RFE to rank feactures and then select
    if method == 'RFE':
        # Using RFE to rank attributes
        lin_reg = LinearRegression()
        rfe = RFE(lin_reg, n_col)
        fit = rfe.fit(train_x, train_y)

        # Select the most relevant attributes for machine learning
        fit_list = fit.support_.tolist()
        indexes = [
            index for index in range(len(fit_list)) if fit_list[index]
        ]
        ]

        # Print out attributes selected and ranking
        print('\nAttributes selected are: ', itemgetter(*indexes)(attr_list))
        print('\nAttributes Ranking: ', fit.ranking_)

        train_x_returned = train_x.iloc[:, indexes]
        test_x_returned = test_x.iloc[:, indexes]

    # Using factor analysis
    elif method == 'fact':
        fact_anal = FactorAnalysis(n_components=n_col)
        train_x_returned = pd.DataFrame(fact_anal.fit_transform(train_x))
        test_x_returned = pd.DataFrame(fact_anal.transform(test_x))

        train_x_returned.columns = [
            ''.join(['feature_', str(i)])
            for i in list(train_x_returned.columns)
        ]
        test_x_returned.columns = [
            ''.join(['feature_', str(i)])
            for i in list(test_x_returned.columns)
        ]

    # Using PCA
    elif method == 'PCA':
        pca_down = PCA(n_components=n_col)
        train_x_returned = pd.DataFrame(pca_down.fit_transform(train_x))
        test_x_returned = pd.DataFrame(pca_down.transform(test_x))

        train_x_returned.columns = [
            ''.join(['feature_', str(i)])
            for i in list(train_x_returned.columns)
        ]
        test_x_returned.columns = [
            ''.join(['feature_', str(i)])
            for i in list(test_x_returned.columns)
        ]

    # Returned selected or regenerated features
    return train_x_returned, test_x_returned
Example No. 31
def factor_dim(df):
    # principal component analysis
    pmodel = PCA(n_components=3)
    lower_mat = pmodel.fit_transform(df)
    df_array   = df.values[:]
    lower_df = DataFrame(lower_mat,columns=["factor1","factor2","factor3"])
    # factor analysis
    fmodel = FactorAnalysis(n_components=3, random_state=0)
    lower_fac = fmodel.fit_transform(df)
    #lower_df = DataFrame(lower_fac, columns=["factor1", "factor2", "factor3"])
    print(lower_df)
    return lower_df
def do_fa(df):
    columns = [
        "cement", "slag", "fly_ash", "water", "superplasticizer",
        "coarse_aggregate", "fine_aggregate"
    ]
    X = df[columns]
    X_std = StandardScaler().fit_transform(X)

    fa = FactorAnalysis(n_components=4, random_state=100)
    X_fa = fa.fit_transform(X_std)
    fa_summary = pd.DataFrame(fa.components_, columns=columns)
    print(fa_summary)
    fa_plot(X_fa[:, 0:2], np.transpose(fa.components_[0:2, :]), columns)
Example No. 33
class Fa(Preprocess):
    """
	因子分析クラスです

	"""
    def __init__(self):
        super().__init__()

    def make_parser(self):
        parser = super().make_parser()
        parser.add_argument("--n_components",
                            dest="n_components",
                            default=2,
                            type=int)
        #parser.add_argument("-t","--target_colname",dest="target_colname",default=None,type=str)
        return parser

    def set_parsed_args_unique(self, parsed):
        self.n_components = parsed.n_components

    def parse_args(self, args):
        parser = self.make_parser()
        return parser.parse_args(args)

    def fa(self, data):
        self.model = FactorAnalysis(cols=self.columns,
                                    n_components=self.n_components)
        transformed = self.model.fit_transform(data)
        return transformed

    def main(self, args):
        parsed = self.parse_args(args)
        self.set_parsed_args_common(parsed)
        self.set_parsed_args_unique(parsed)

        data = self.read_data()
        self.columns = self.get_col_list()
        # factor analysis
        data.data = self.fa(data.data)
        # add this preprocessing step to the flow
        data.add_preprocess(self.model)
        """
		#変換規則のファイル出力
		with open(self.temp_files_path+"pca.pickle","wb") as f:
			pickle.dump(self.model,f)

		#前処理の順番を保存
		self.write_order()
		"""
        #独立成分データセット出力
        self.write_data(data)
def factor_analysis( data ):
    fa = FactorAnalysis()
    features = numerical_features + categorical_features
    fa_data = fa.fit_transform( data[features] )
    plt.figure()
    plt.subplot(2,2,1)
    plt.scatter( fa_data[:,0], fa_data[:,1], c=data[target] )
    plt.subplot(2,2,2)
    plt.scatter( fa_data[:,2], fa_data[:,3], c=data[target] )
    plt.subplot(2,2,3)
    plt.scatter( fa_data[:,4], fa_data[:,5], c=data[target] )
    plt.subplot(2,2,4)
    plt.scatter( fa_data[:,6], fa_data[:,7], c=data[target] )
    return fa_data
Example No. 35
    def fit(self, X, y, truncated=None):
        print("Computing M: (%i × %i)" % (self.nb_users, self.nb_works))
        matrix = self.make_matrix(X, y)

        model = FactorAnalysis(n_components=self.NB_COMPONENTS)
        matrix = matrix.toarray()
        self.matrix = matrix
        if truncated is not None:
            matrix = matrix[:, :truncated]
        self.W = model.fit_transform(matrix)
        self.H = model.components_
        print('Shapes', self.W.shape, self.H.shape)
        self.M = self.W.dot(self.H) + model.mean_
        self.model = model

        self.chrono.save('factor matrix')
Example No. 36
def factoran(data: ndarray, n_factors: int = 2, demean=True, scale=True):
    from sklearn.decomposition import FactorAnalysis

    data = data.copy()
    if demean:
        data -= data.mean(axis=0)
    if scale:
        data /= data.std(axis=0, ddof=1)

    fa = FactorAnalysis(n_components=n_factors)
    # validation
    # from sklearn.model_selection import cross_val_score
    # cross_val_score(fa, data).mean()
    scores = fa.fit_transform(data)
    coeffs = np.sqrt(0.5) * fa.components_.T
    return coeffs, scores
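A minimal call on random data (assumed, not from the original source), showing the shapes of the scaled loadings and the factor scores:

import numpy as np

X = np.random.default_rng(0).normal(size=(200, 6))
coeffs, scores = factoran(X, n_factors=2)
print(coeffs.shape, scores.shape)   # (6, 2) (200, 2)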
Example No. 37
def testAlgorithm():
    import matplotlib.pyplot as plt

    random.seed(35)
    np.random.seed(32)

    n = 200
    d = 20
    k = 2
    sigma = .3
    n_clusters = 3
    decay_coef = .1

    X, Y, Z, ids = generateSimulatedDimensionalityReductionData(n_clusters, n, d, k, sigma, decay_coef)

    Zhat, params = block_ZIFA.fitModel(Y, k)
    colors = ['red', 'blue', 'green']
    cluster_ids = sorted(list(set(ids)))
    model = FactorAnalysis(n_components=k)
    factor_analysis_Zhat = model.fit_transform(Y)

    plt.figure(figsize=[15, 5])

    plt.subplot(131)
    for id in cluster_ids:
        plt.scatter(Z[ids == id, 0], Z[ids == id, 1], color=colors[id - 1], s=4)
        plt.title('True Latent Positions\nFraction of Zeros %2.3f' % (Y == 0).mean())
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])

    plt.subplot(132)
    for id in cluster_ids:
        plt.scatter(Zhat[ids == id, 0], Zhat[ids == id, 1], color=colors[id - 1], s=4)
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])
        plt.title('ZIFA Estimated Latent Positions')
        # title(titles[method])

    plt.subplot(133)
    for id in cluster_ids:
        plt.scatter(factor_analysis_Zhat[ids == id, 0], factor_analysis_Zhat[ids == id, 1], color = colors[id - 1], s = 4)
        plt.xlim([-4, 4])
        plt.ylim([-4, 4])
        plt.title('Factor Analysis Estimated Latent Positions')

    plt.show()
Example No. 38
def initialize(trials, params, config):
    """Make skeleton"""
    # TODO: fast initialization for large dataset
    from sklearn.decomposition import FactorAnalysis

    zdim = params["zdim"]
    xdim = params["xdim"]

    # TODO: use only a subsample of trials?
    y = np.concatenate([trial["y"] for trial in trials], axis=0)
    subsample = np.random.choice(y.shape[0], max(y.shape[0] // 10, 50))
    ydim = y.shape[-1]
    fa = FactorAnalysis(n_components=zdim, random_state=0)
    z = fa.fit_transform(y[subsample, :])
    a = fa.components_
    b = np.log(np.maximum(np.mean(y, axis=0, keepdims=True), config["eps"]))
    noise = np.var(y[subsample, :] - z @ a, ddof=0, axis=0)

    # stupid way of update
    # two cases
    # 1) no key
    # 2) empty value (None)
    if params.get("a") is None:
        params.update(a=a)
    if params.get("b") is None:
        params.update(b=b)
    if params.get("noise") is None:
        params.update(noise=noise)

    for trial in trials:
        length = trial["y"].shape[0]

        if trial.get("mu") is None:
            trial.update(mu=fa.transform(trial["y"]))

        if trial.get("x") is None:
            trial.update(x=np.ones((length, xdim, ydim)))

        trial.update({"w": np.zeros((length, zdim)), "v": np.zeros((length, zdim))})
Example No. 39
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA, FastICA, FactorAnalysis

rng = np.random.RandomState(42)
s = rng.normal(scale=0.01,size=(4,1000))
S = np.ones((3,1000))
S[0] = s[0]
S[1] = s[1]
S[2] = s[0]+s[1]

pca = PCA()
S_pca_ = pca.fit_transform(S.T)

fa = FactorAnalysis(svd_method="lapack")
S_fa_ = fa.fit_transform(S.T)

ica = FastICA(max_iter=20000, tol=0.00001)
S_ica_ = ica.fit_transform(S.T)  # Estimate the sources

def plot_3d(data, ax, axis_list=None):
	data /= np.std(data)
	ax.scatter(data[0] ,data[1], data[2] , s=2, marker='o', zorder=10, color='steelblue', alpha=0.5)
	ax.set_xlim(-4, 4)
	ax.set_ylim(-4, 4)
	ax.set_zlim(-4, 4)
	ax.set_xlabel('x')
	ax.set_ylabel('y')
	ax.set_zlabel('z')
	for label in (ax.get_xticklabels() + ax.get_yticklabels() + ax.get_zticklabels()):
		label.set_fontsize(6)
pca = decomposition.PCA()
sub_pca_prime = pca.fit_transform(sub_pca_imputed) 

pca.n_components_  # the estimated number of components
pca.components_  # principal component loadings
pca.explained_variance_ratio_ # percentage of variance explained by each principal components
pca.explained_variance_ratio_.cumsum()  # cumulative sum of percentage of variance explained


# Factor Analysis
GSS = pd.read_csv("GSS_Cum.csv")
sub = GSS.loc[:, 'confinan':'conarmy']

# impute missing values in DataFrame sub
from sklearn.impute import SimpleImputer
impute = SimpleImputer()
sub_imputed = impute.fit_transform(sub)

# use FactorAnalysis package 
from sklearn.decomposition import FactorAnalysis
fa = FactorAnalysis(n_components = 5, max_iter = 100) #Here we set dimensionality of latent space to be 5 and maximum number of iterations to be 100
sub_fa = fa.fit_transform(sub_imputed)

fa.components_  # factor loadings
fa.loglike_   # the log likelihood at each iteration
fa.n_iter_   # Number of iterations run
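To read the loadings against the original survey items, the components can be wrapped in a labeled DataFrame (a sketch assuming sub is the DataFrame sliced above and that no columns were dropped during imputation):

import pandas as pd

loadings = pd.DataFrame(fa.components_.T,
                        index=sub.columns,
                        columns=['factor%d' % (i + 1) for i in range(5)])
print(loadings.round(2))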



Example No. 41
def initalizeParams(Y, k, method = 'standard'):
	"""
	initializes parameters. 
	By default, (method set to "standard") initializes using a mixture model. 
	If method is set to "high_dimensional", first does dimensionality reduction using factor analysis 
	and then clusters the low-dimensional data. 
	Checked.
	"""
	assert(method in ['high_dimensional', 'standard'])
	if method == 'high_dimensional':
		N, D = Y.shape
		#initialize using factor analysis. 
		model = FactorAnalysis(n_components = 5)
		low_dim_Y = model.fit_transform(Y)
		kmeans_model = KMeans(n_clusters = k)
		z = kmeans_model.fit_predict(low_dim_Y)
		cluster_mus = np.zeros([D, k])
		cluster_weights = np.zeros([k,])
		cluster_sigmas = np.zeros([D, k])
		
		for z_i in sorted(set(z)):
			idxs = (z == z_i)
			cluster_weights[z_i] = np.mean(idxs)
			cluster_Y = Y[idxs, :]
			cluster_Y_is_nonzero = np.abs(cluster_Y) > 1e-6
			cluster_mus[:, z_i] = cluster_Y.sum(axis = 0) / cluster_Y_is_nonzero.sum(axis = 0)
			
			cluster_sigmas[:, z_i] = np.sqrt(((cluster_Y ** 2).sum(axis = 0) - 2 * cluster_mus[:, z_i] * (cluster_Y.sum(axis = 0)) + cluster_mus[:, z_i]**2 * cluster_Y_is_nonzero.sum(axis = 0)) / cluster_Y_is_nonzero.sum(axis = 0))
			for j in range(1, 5):
				assert(np.abs(cluster_sigmas[j, z_i] - np.std(cluster_Y[cluster_Y_is_nonzero[:, j], j])) < 1e-4)		
		
		
	if method == 'standard':
		N, D = Y.shape
		model = GMM(n_components = k)
		imputedY = deepcopy(Y)
		for j in range(D):
			non_zero_idxs = np.abs(Y[:, j]) > 1e-6
			for i in range(N):
				if Y[i][j] == 0:
					imputedY[i][j] = np.random.choice(Y[non_zero_idxs, j])
		model.fit(imputedY)
		cluster_mus = model.means_.transpose()
		cluster_weights = model.weights_
		cluster_sigmas = np.sqrt(model.covars_.transpose())
		
	#now fit decay coefficient
	means = []
	ps = []
	for j in range(D):
		non_zero_idxs = np.abs(Y[:, j]) > 1e-6
		means.append(Y[non_zero_idxs, j].mean())
		ps.append(1 - non_zero_idxs.mean())
	
	
	decay_coef, pcov = curve_fit(exp_decay, means, ps)
	mse = np.mean(np.abs(ps - np.exp(-decay_coef * (np.array(means) ** 2))))
	print('Decay Coef is %2.3f; MSE is %2.3f' % (decay_coef, mse))
	
	decay_coef = decay_coef[0]

	
	assert(np.all(cluster_sigmas > 0))
	return cluster_mus, cluster_sigmas, cluster_weights, decay_coef
Example No. 42
import sys
from sklearn.decomposition import FactorAnalysis
from sklearn.datasets import load_svmlight_file, dump_svmlight_file

if __name__ == "__main__":
    svm_file = sys.argv[1]
    dim = int(sys.argv[2])
    fa = FactorAnalysis(
        n_components=dim, 
        tol=0.01, 
        copy=False,
        max_iter=1000, 
        verbose=3, 
        noise_variance_init=None,
    )

    X, y = load_svmlight_file(svm_file, zero_based = False, query_id = False)
    X_new = fa.fit_transform(X.toarray(), y)

    dump_svmlight_file(X_new, y, "%s.fa%d" % (svm_file, dim), zero_based = False)



Example No. 43
# X = np.dot(S, A.T)  # Generate observations

from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, FastICA, FactorAnalysis

rng = np.random.RandomState(42)
S = rng.normal(scale=0.01,size=(10000, 2))
S[:,1][::2] *= 1.7
S[:,0][::2] /= 1.7
S[:,1][1::2] /= 1.7
S[:,0][1::2] *= 1.7
X=deepcopy(S)
X[:,1] = X[:,0]/-2+X[:,1]

pca = PCA()
S_pca_ = pca.fit_transform(X)

fa = FactorAnalysis(svd_method="lapack")
S_fa_ = fa.fit_transform(X)

ica = FastICA(max_iter=20000, tol=0.00001)
S_ica_ = ica.fit_transform(X)  # Estimate the sources


###############################################################################
# Plot results

def plot_samples(S, axis_list=None):
    plt.scatter(S[:, 0], S[:, 1], s=2, marker='o', zorder=10,
                color='steelblue', alpha=0.5)
    if axis_list is not None:
        colors = ['orange', 'red']
        for color, axis in zip(colors, axis_list):
            axis /= axis.std()
Example No. 44
def base(
    use_filter="default",
    data_path="~/data/faons/latest.csv",
    filter_name="default.csv",
    participant_subset="",
    drop_metadata=True,
    drop=[],
    clean=7,
    components=5,
    facecolor="#ffffff",
):

    data_path = path.expanduser(data_path)
    filter_path = path.join(path.dirname(path.realpath(__file__)), "filters", filter_name)

    filters = pd.read_csv(
        filter_path, index_col=0, header=None
    ).transpose()  # transpose filters because of .csv file formatting, specify index_col to not get numbered index
    all_data = pd.read_csv(data_path)

    all_data = all_data[[len(set(row)) > clean for row in np.array(all_data)]]

    # drops metadata
    if drop_metadata == True:
        all_data = all_data.drop(filters["metadata"][pd.Series.notnull(filters["metadata"])], axis=1)

        # compile list of column names to be dropped:
    drop_list = []
    for drop_item in drop:
        drop_list += list(filters[drop_item][pd.Series.notnull(filters[drop_item])])
    drop_list = list(
        set(drop_list)
    )  # get unique column names (the list may contain duplicates if overlaying multiple filters)
    all_data = all_data.drop(drop_list, axis=1)

    if participant_subset == "odd":
        keep_rows = all_data.index.values[1::2]
        filtered_data = all_data.ix[keep_rows]
    elif participant_subset == "even":
        keep_rows = all_data.index.values[0::2]
        filtered_data = all_data.ix[keep_rows]
    elif participant_subset == "male":
        filtered_data = all_data[all_data["My legal gender:"] == "Male"]
    elif participant_subset == "female":
        filtered_data = all_data[all_data["My legal gender:"] == "Female"]
    else:
        filtered_data = all_data

        # convert to correct type for analysis:
    filtered_data_array = np.array(filtered_data, dtype="float64")

    filtered_data_array = filtered_data_array / 100

    pca = PCA()
    S_pca_ = pca.fit_transform(filtered_data_array)

    fa = FactorAnalysis(svd_method="lapack")
    S_fa_ = fa.fit_transform(filtered_data_array)

    ica = FastICA(n_components=components, max_iter=20000, tol=0.00001)
    S_ica_ = ica.fit_transform(filtered_data_array)  # Estimate the sources

    load = ica.mixing_

    remapped_cmap = remappedColorMap(
        cm.PiYG,
        start=(np.max(load) - abs(np.min(load))) / (2 * np.max(load)),
        midpoint=abs(np.min(load)) / (np.max(load) + abs(np.min(load))),
        name="shrunk",
    )
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(17.5, 5), facecolor=facecolor)
    graphic = ax.imshow(load, cmap=remapped_cmap, interpolation="none")
def compute_FA(df):
    FA = FactorAnalysis()
    return FA.fit_transform(df)
Example No. 46
    preds.append([])
    certainty.append([])
    # each network has a vote in that cross validation fold
    for s in range(len(seeds)):
        X = np.vstack([np.array(g1_fmri[s]), np.array(g2_fmri[s])])
        y = np.array(labels)
        X = preprocessing.scale(X)

        print('seed %d: cv %d/%d' % (s + 1, oidx + 1, nobs))
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        c_val_scores = []
        dimred = FactorAnalysis(n_components=20)
        X_train = dimred.fit_transform(X_train)
        X_test = dimred.transform(X_test)
        for c in cs:
            inner_preds = []
            clf = LogisticRegression(C=c, penalty="l1", dual=False, class_weight='auto')
            for iidx, (itrain, itest) in enumerate(inner_cv):
                X_inner_train = X_train[itrain]
                X_val = X_train[itest]
                y_inner_train = y_train[itrain]
                y_val = y_train[itest]
                scaler = preprocessing.StandardScaler().fit(X_inner_train)
                X_inner_train = scaler.transform(X_inner_train)
                X_val = scaler.transform(X_val)
                clf.fit(X_inner_train, y_inner_train)
                inner_preds.append(clf.predict(X_val))
            c_val_scores.append(f1_score(y_train, inner_preds, pos_label=1))
Example No. 47
#For example, keep at least 98 percent of the variance (energy)
pca = decomposition.PCA(n_components=.98)
iris_X_prime = pca.fit_transform(iris_X)
pca.explained_variance_ratio_.sum()
#1.0


#Using factor analysis for decomposition (dimensionality reduction)

#Factor analysis is another technique we can use to reduce dimensionality. However, factor
#analysis makes assumptions and PCA does not. The basic assumption is that there are
#implicit features responsible for the features of the dataset.

from sklearn.decomposition import FactorAnalysis
fa = FactorAnalysis(n_components=2)
iris_two_dim = fa.fit_transform(iris.data)
iris_two_dim[:5]
#array([[-1.33125848, 0.55846779],
#[-1.33914102, -0.00509715],
#[-1.40258715, -0.307983 ],
#[-1.29839497, -0.71854288],
#[-1.33587575, 0.36533259]])


#Kernel PCA for nonlinear dimensionality reduction

#Generate nonlinear data
import numpy as np
A1_mean = [1, 1]
A1_cov = [[2, .99], [1, 1]]
A1 = np.random.multivariate_normal(A1_mean, A1_cov, 50)