import os
import numpy as np
from sklearn.decomposition import FactorAnalysis

def factor_analysis(results_dir):
    # Transpose so that rows are samples and columns are features.
    data_array = np.transpose(
        np.genfromtxt(os.path.join(results_dir, 'summary.csv'), delimiter=','))
    fa = FactorAnalysis(n_components=2)
    new_array = fa.fit_transform(data_array)
    print(fa.get_covariance().shape)
    print(new_array)
    np.savetxt(os.path.join(results_dir, 'FA-datasets-2.csv'), new_array,
               delimiter=',')
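A minimal invocation sketch for the helper above, assuming a directory results/ that already contains the expected summary.csv (the path here is purely hypothetical):

# Writes the two-factor projection to results/FA-datasets-2.csv.
factor_analysis('results')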
def fit_factor_analysis(data, percentage=0.8):
    """
    Runs the factor analysis.

    Parameters
    ----------
    data : array of floats [n_samples, n_features]
        The data to analyse (passed explicitly here; the original relied
        on a module-level ``data`` array).
    percentage : float, default 0.8
        The fraction of the cumulative sum of the eigenvalues to be
        retained. This number determines the number of loading factors
        used in the analysis.

    Returns
    -------
    X : array of floats [n_samples, n_factors]
        The transformed data after the factor analysis.
    components : array of floats [n_factors, n_features]
        The components of the factor analysis.
    """
    # First fit with all components to estimate the full model covariance.
    fa = FactorAnalysis()
    fa.fit(data)
    C = fa.get_covariance()
    # eigh returns eigenvalues in ascending order; reverse them so the
    # cumulative sum runs from the largest eigenvalue down.
    l, e = np.linalg.eigh(C)
    cs = np.cumsum(l[::-1]) / np.sum(l)
    n = np.sum(cs < percentage)

    # Refit, keeping only the components needed to retain the requested
    # fraction of the variance.
    fa.n_components = n
    X_ = fa.fit_transform(data)
    components = fa.components_
    return X_, components
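A short usage sketch for fit_factor_analysis on toy data; the random matrix below is only an illustration, not data from any of the examples:

import numpy as np

# Toy data: 100 samples of 6 features driven by 2 latent factors.
rng = np.random.RandomState(0)
latent = rng.randn(100, 2)
data = latent.dot(rng.randn(2, 6)) + 0.1 * rng.randn(100, 6)

X, components = fit_factor_analysis(data, percentage=0.8)
print(X.shape)           # (100, n_factors)
print(components.shape)  # (n_factors, 6)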
Example No. 3
def test_factor_analysis():
    """Test FactorAnalysis ability to recover the data covariance structure
    """
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variables of dim 3, 20 of them
    h = rng.randn(n_samples, n_components)
    # using gamma to model a different noise variance
    # per feature
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise
    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)
    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())

        # Make sure the log-likelihood increases at each iteration.
        assert_greater(np.min(np.diff(fa.loglike_)), 0.,
                       'Log likelihood did not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=False, bias=True)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        assert_raises(ValueError, fa.fit, X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter('always', ConvergenceWarning)
        fa1.max_iter = 1
        fa1.verbose = True
        fa1.fit(X)
        assert_true(w[-1].category == ConvergenceWarning)

        warnings.simplefilter('always', DeprecationWarning)
        FactorAnalysis(verbose=1)
        assert_true(w[-1].category == DeprecationWarning)
Example No. 4
def test_factor_analysis():
    """Test FactorAnalysis ability to recover the data covariance structure
    """
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variables of dim 3, 20 of them
    h = rng.randn(n_samples, n_components)
    # using gamma to model a different noise variance
    # per feature
    noise = rng.gamma(1, size=n_features) \
                * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    fa = FactorAnalysis(n_components=n_components)
    fa.fit(X)
    X_t = fa.transform(X)
    assert_true(X_t.shape == (n_samples, n_components))

    assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())

    # Make sure the log-likelihood increases at each iteration
    assert_true(np.all(np.diff(fa.loglike_) > 0.))

    # Sample Covariance
    scov = np.cov(X, rowvar=False, bias=True)

    # Model Covariance
    mcov = fa.get_covariance()
    diff = np.sum(np.abs(scov - mcov)) / W.size
    assert_true(diff < 0.1, "Mean absolute difference is %f" % diff)

    fa = FactorAnalysis(n_components=n_components,
                        noise_variance_init=np.ones(n_features))
    assert_raises(ValueError, fa.fit, X[:, :2])
Example No. 5
def selectedm_criteria_test():
    # FA_dataset, GaussianPdf and plt_show_result are project-local helpers.
    N = 100       # number of samples
    n = 10        # number of observed features
    m = 3         # true number of latent factors
    sigma2 = 0.1  # noise variance of the generative model
    mu = 0        # mean of the generative model
    [X, Y, A] = FA_dataset.read_dataset(N, n, m, sigma2, mu)
    ncomps_aic_bic = [[], [], [], []]
    for n_components in range(1, 8):
        fa = FactorAnalysis(n_components=n_components,
                            tol=0.0001,
                            max_iter=1000)
        fa.fit(X)
        gc = GaussianPdf(fa.mean_, fa.get_covariance())
        # fa.loglike_[-1] == gc.sum_log_pdf(X)
        loglikelihood = gc.sum_log_pdf(X)
        # Rough free-parameter count: n * n_components loadings plus a
        # mean and a noise variance for each feature.
        d = n * n_components + X.shape[1] * 2
        ncomps_aic_bic[0].append(n_components)
        ncomps_aic_bic[1].append(loglikelihood)
        ncomps_aic_bic[2].append(loglikelihood - d)  # AIC-like score
        ncomps_aic_bic[3].append(loglikelihood - 0.5 * d * np.log(N))  # BIC-like score
        print(n_components, loglikelihood - d,
              loglikelihood - 0.5 * d * np.log(N))
    plt_show_result(ncomps_aic_bic)
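The same model-selection idea can be sketched with plain scikit-learn, without the project-local FA_dataset, GaussianPdf and plt_show_result helpers. This is a minimal illustration under the same rough parameter count as above, not the original author's code; it assumes X is an (N, n) data matrix:

import numpy as np
from sklearn.decomposition import FactorAnalysis

def penalized_loglikes(X, n_components):
    """Return (AIC-like, BIC-like) scores; larger is better."""
    N, n = X.shape
    fa = FactorAnalysis(n_components=n_components, tol=1e-4, max_iter=1000)
    fa.fit(X)
    # score() is the mean per-sample log-likelihood; scale up to a total.
    loglike = fa.score(X) * N
    d = n * n_components + 2 * n  # loadings + per-feature mean and noise variance
    return loglike - d, loglike - 0.5 * d * np.log(N)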
Example No. 6
def test_factor_analysis():
    # Test FactorAnalysis ability to recover the data covariance structure
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variables of dim 3, 20 of them
    h = rng.randn(n_samples, n_components)
    # using gamma to model a different noise variance
    # per feature
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    assert_raises(ValueError, FactorAnalysis, svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    assert_raises(ValueError, fa_fail.fit, X)
    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert_equal(X_t.shape, (n_samples, n_components))

        assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())
        assert_almost_equal(fa.score_samples(X).mean(), fa.score(X))

        # Make sure the log-likelihood increases at each iteration.
        assert_greater(np.min(np.diff(fa.loglike_)), 0.,
                       'Log likelihood did not increase')

        # Sample Covariance
        scov = np.cov(X, rowvar=False, bias=True)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert_less(diff, 0.1, "Mean absolute difference is %f" % diff)
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        assert_raises(ValueError, fa.fit, X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))

    fa1.max_iter = 1
    fa1.verbose = True
    assert_warns(ConvergenceWarning, fa1.fit, X)

    # Test get_covariance and get_precision with n_components == n_features
    # with n_components < n_features and with n_components == 0
    for n_components in [0, 2, X.shape[1]]:
        fa.n_components = n_components
        fa.fit(X)
        cov = fa.get_covariance()
        precision = fa.get_precision()
        assert_array_almost_equal(np.dot(cov, precision),
                                  np.eye(X.shape[1]), 12)
Example No. 7
def test_factor_analysis():
    # Test FactorAnalysis ability to recover the data covariance structure
    rng = np.random.RandomState(0)
    n_samples, n_features, n_components = 20, 5, 3

    # Some random settings for the generative model
    W = rng.randn(n_components, n_features)
    # latent variables of dim 3, 20 of them
    h = rng.randn(n_samples, n_components)
    # using gamma to model a different noise variance
    # per feature
    noise = rng.gamma(1, size=n_features) * rng.randn(n_samples, n_features)

    # generate observations
    # wlog, mean is 0
    X = np.dot(h, W) + noise

    with pytest.raises(ValueError):
        FactorAnalysis(svd_method='foo')
    fa_fail = FactorAnalysis()
    fa_fail.svd_method = 'foo'
    with pytest.raises(ValueError):
        fa_fail.fit(X)
    fas = []
    for method in ['randomized', 'lapack']:
        fa = FactorAnalysis(n_components=n_components, svd_method=method)
        fa.fit(X)
        fas.append(fa)

        X_t = fa.transform(X)
        assert X_t.shape == (n_samples, n_components)

        assert_almost_equal(fa.loglike_[-1], fa.score_samples(X).sum())
        assert_almost_equal(fa.score_samples(X).mean(), fa.score(X))

        # Make sure the log-likelihood increases at each iteration.
        assert np.all(np.diff(fa.loglike_) > 0.), 'Log likelihood did not increase'

        # Sample Covariance
        scov = np.cov(X, rowvar=False, bias=True)

        # Model Covariance
        mcov = fa.get_covariance()
        diff = np.sum(np.abs(scov - mcov)) / W.size
        assert diff < 0.1, "Mean absolute difference is %f" % diff
        fa = FactorAnalysis(n_components=n_components,
                            noise_variance_init=np.ones(n_features))
        with pytest.raises(ValueError):
            fa.fit(X[:, :2])

    f = lambda x, y: np.abs(getattr(x, y))  # sign will not be equal
    fa1, fa2 = fas
    for attr in ['loglike_', 'components_', 'noise_variance_']:
        assert_almost_equal(f(fa1, attr), f(fa2, attr))

    fa1.max_iter = 1
    fa1.verbose = True
    assert_warns(ConvergenceWarning, fa1.fit, X)

    # Test get_covariance and get_precision with n_components == n_features
    # with n_components < n_features and with n_components == 0
    for n_components in [0, 2, X.shape[1]]:
        fa.n_components = n_components
        fa.fit(X)
        cov = fa.get_covariance()
        precision = fa.get_precision()
        assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]),
                                  12)

    # test rotation
    n_components = 2

    results, projections = {}, {}
    for method in (None, 'varimax', 'quartimax'):
        fa_var = FactorAnalysis(n_components=n_components, rotation=method)
        results[method] = fa_var.fit_transform(X)
        projections[method] = fa_var.get_covariance()
    for rot1, rot2 in combinations([None, 'varimax', 'quartimax'], 2):
        assert not np.allclose(results[rot1], results[rot2])
        assert np.allclose(projections[rot1], projections[rot2], atol=3)

    with pytest.raises(ValueError):
        FactorAnalysis(rotation='not_implemented').fit_transform(X)

    # test against R's psych::principal with rotate="varimax"
    # (i.e., the values below stem from rotating the components in R)
    # R's factor analysis returns quite different values; therefore, we only
    # test the rotation itself
    factors = np.array([[0.89421016, -0.35854928, -0.27770122, 0.03773647],
                        [-0.45081822, -0.89132754, 0.0932195, -0.01787973],
                        [0.99500666, -0.02031465, 0.05426497, -0.11539407],
                        [0.96822861, -0.06299656, 0.24411001, 0.07540887]])
    r_solution = np.array([[0.962, 0.052], [-0.141, 0.989], [0.949, -0.300],
                           [0.937, -0.251]])
    rotated = _ortho_rotation(factors[:, :n_components], method='varimax').T
    assert_array_almost_equal(np.abs(rotated), np.abs(r_solution), decimal=3)
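In scikit-learn 0.24 and later, the rotation exercised by this test is available directly through the public API; a minimal sketch, assuming X is any (n_samples, n_features) array:

from sklearn.decomposition import FactorAnalysis

fa_var = FactorAnalysis(n_components=2, rotation='varimax')
X_rot = fa_var.fit_transform(X)
loadings = fa_var.components_  # varimax-rotated loadings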
Example No. 8
class FA(object):  # factor analysis
    def __init__(self,
                 n_components=None,
                 tol=1e-2,
                 copy=True,
                 max_iter=1000,
                 noise_variance_init=None,
                 svd_method='randomized',
                 iterated_power=3,
                 random_state=0):
        """
        :param n_components:   int 想要的分量个数, 默认为None,表示全部需要
        :param tol: float  停止对数斯然估计的增加的容忍度
        :param copy:  bool   False时,在fit阶段,数据会被覆盖
        :param max_iter: # 最大迭代次数
        :param noise_variance_init:   # None | array, shape=(n_features,)  每个特征的噪声方差的初始化猜测,
        如果为None,  默认为np.ones(n_features)
        :param svd_method:  {"lapack","randomized"}, "lapack", 使用标注的svd, "randomized"  使用快速的随机svd
        :param iterated_power: int, 可选项。 默认为3, 幂方法的迭代次数
        :param random_state: 随机种子
        """
        self.model = FactorAnalysis(n_components=n_components,
                                    tol=tol,
                                    copy=copy,
                                    max_iter=max_iter,
                                    noise_variance_init=noise_variance_init,
                                    svd_method=svd_method,
                                    iterated_power=iterated_power,
                                    random_state=random_state)

    def fit(self, x, y=None):
        return self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y=None):
        return self.model.fit_transform(X=x, y=y)

    def get_covariance(self):
        return self.model.get_covariance()

    def get_params(self, deep=True):
        return self.model.get_params(deep=deep)

    def set_params(self, **params):
        return self.model.set_params(**params)

    def get_precision(self):  # precision matrix under the factor analysis model
        return self.model.get_precision()

    def score(self, x, y=None):
        return self.model.score(X=x, y=y)

    def score_sample(self, x):
        return self.model.score_samples(X=x)

    def get_attributes(self):
        component = self.model.components_  # components with maximum variance
        loglike = self.model.loglike_  # log-likelihood at each iteration
        noise_var = self.model.noise_variance_  # estimated per-feature noise variance, array of shape (n_features,)
        n_iter = self.model.n_iter_  # int, number of iterations run
        mean = self.model.mean_  # array, shape (n_features,), per-feature empirical mean estimated from the training data

        return component, loglike, noise_var, n_iter, mean
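A brief usage sketch for the FA wrapper above, on random toy data used purely for illustration:

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(50, 5)

fa = FA(n_components=2)
scores = fa.fit_transform(x)  # (50, 2) factor scores
component, loglike, noise_var, n_iter, mean = fa.get_attributes()
print(scores.shape, component.shape)  # (50, 2) (2, 5)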
Example No. 9
def main():
    # load data from http://www.ats.ucla.edu/stat/spss/output/principal_components_files/M255.SAV
    df = Sav2Df('M255.SAV')
    hmap = getHeaderDict('M255.SAV')

    # keep columns as stipulated in original article 
    keep_cols = ['item13', 'item14', 'item15', 'item16', 'item17', 'item18', 
                 'item19', 'item20', 'item21', 'item22', 'item23', 'item24']

    dfX = df[keep_cols].rename(columns=hmap)  # .astype(np.float32)
    X = dfX.to_numpy()  # DataFrame.as_matrix() was removed in pandas 1.0

    univariate_table = get_univariate_table(dfX) #descriptive statistics

    eig_init_perc_df = get_initial_percent_explained_variance(dfX)


    ### Decide optimal n_components based on cross validation
    # adapted from http://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_vs_fa_model_selection.html

    n_components = np.arange(X.shape[1])  # alternatives tried: [4, 6, 8, 10, 12, 14, 16], range(12)
    n_components_fa = find_fa_n_components_by_crossval(X, n_components)

    # Here the drop was after 4 components; in the original example, 3 is deemed optimal.

    # with n selected, fit a FactorAnalysis model
    n_components_fa = 3  # 4 per the cross-validation above; 3 per the original article
    fa = FactorAnalysis(n_components_fa, random_state=101)
    factor = fa.fit(X)

    covar = fa.get_covariance()
    
    # print(global_kmo(covar))  # results not yet validated

    eig_extraction_perc_df = get_factor_extraction_percent_explained_variance(covar)
    eig_retain_df = eig_extraction_perc_df.iloc[:n_components_fa]

    loads = pd.DataFrame(fa.components_, columns=macro_cols)  # macro_cols and header_en are assumed to be defined at module level
    # loads = loads.rename(columns=header_en)
    cols = loads.columns
    # loads.to_csv('loads_matrix_10_components.csv', index=False, encoding='utf-8')
    # OR just load
    # loads = pd.read_csv('loads_matrix_10_components.csv', encoding='utf-8')

    ## Write each output table to a sheet in an excel file

    writer = pd.ExcelWriter('factor_analysis_{}_components_varimax.xlsx'.format(n_components_fa),
                         engine='xlsxwriter', options={'encoding':'utf-8'})


    cutoff = 0.3
    for i in range(len(loads)):  # xrange is Python 2 only
        s = loads.iloc[i].sort_values(ascending=False).reset_index()
        s = s[(s[i] > cutoff) | (s[i] < -cutoff)]
        s.rename(columns={'index': 'component'}).to_excel(writer, 'Positive Sorted Loads', encoding='utf-8', startcol=3*i, index=False)

    for i in range(len(loads)):
        s = loads.iloc[i].sort_values(ascending=False).reset_index()
        s.rename(columns={'index': 'component'}).to_excel(writer, 'Loads', encoding='utf-8', startcol=3*i, index=False)

    loads.rename(columns=header_en).to_excel(writer, 'Raw Components', encoding='utf-8')

    # repeat for rotated loads
    loads = pd.DataFrame(varimax(fa.components_),columns=macro_cols)
    cutoff = 0.3
    for i in range(len(loads)):
        s = loads.iloc[i].sort_values(ascending=False).reset_index()
        s = s[(s[i] > cutoff) | (s[i] < -cutoff)]
        s.rename(columns={'index': 'component'}).to_excel(writer, 'Varimax Rot. Pos. Sorted Loads', encoding='utf-8', startcol=3*i, index=False)

    for i in range(len(loads)):
        s = loads.iloc[i].sort_values(ascending=False).reset_index()
        s.rename(columns={'index': 'component'}).to_excel(writer, 'Varimax Rot. Loads', encoding='utf-8', startcol=3*i, index=False)

    loads.rename(columns=header_en).to_excel(writer, 'Varimax Raw Components', encoding='utf-8')

    ### explained variances and noise

    # levels = [['Initial Eigenvalues', 'Extraction'],
    #           ['Total', '% of Variance', 'Cumulative %']]
    # eig_df = pd.DataFrame(columns=pd.MANUALLY_DIGESTED)

    eig_init_perc_df.to_excel(writer, 'Eigenvals (perc. explained var)',  encoding='utf-8')
    eig_retain_df.to_excel(writer, 'Eigenvals (perc. explained var)',startcol=3,  encoding='utf-8')


    noise_var = pd.DataFrame(fa.noise_variance_, index=macro_cols)  # .rename(index=header_en)
    noise_var.to_excel(writer, 'Noise Variance', encoding='utf-8')

    covar_df = pd.DataFrame(covar, index=cols, columns=cols
                            ).rename(columns=header_en, index=header_en)  # 'covar_extraction' in the original; presumably the model covariance computed above
    covar_df.to_excel(writer, 'Covariance Matrix', encoding='utf-8')

    # corr_init_df (correlation matrix of the raw data) and corr_extraction
    # are assumed to be computed earlier in the full script.
    corr_init_df.to_excel(writer, 'Correlation Mat(Data)', encoding='utf-8')

    corr_extraction_df = pd.DataFrame(corr_extraction, index=macro_cols,
                                      columns=macro_cols)
    corr_extraction_df.to_excel(writer, 'Corr Mat of Factor Loads', encoding='utf-8')

    precision_mat = pd.DataFrame(fa.get_precision(), index=cols, columns=cols).rename(columns=header_en)
    precision_mat.to_excel(writer, 'Precision Matrix', encoding='utf-8')


    writer.save()  # in pandas >= 2.0 use writer.close()
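The find_fa_n_components_by_crossval helper used in main() is not shown; the following is a hedged sketch of the idea, adapted from the scikit-learn model-selection example linked in the comment above (the helper's exact signature is an assumption):

import numpy as np
from sklearn.decomposition import FactorAnalysis
from sklearn.model_selection import cross_val_score

def find_fa_n_components_by_crossval(X, n_components):
    """Pick the n maximizing the cross-validated log-likelihood."""
    scores = []
    for n in n_components:
        fa = FactorAnalysis(n_components=n)
        # FactorAnalysis.score() is the average log-likelihood, so
        # higher cross-validated scores are better.
        scores.append(np.mean(cross_val_score(fa, X)))
    return n_components[int(np.argmax(scores))]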
Example No. 10
# Step 3. Perform factor analysis
# fifa_num is assumed to hold the numeric columns of the dataset,
# prepared in an earlier step.

# Choose the number of factors
transformer = FactorAnalysis(n_components=4)
X_transformed = transformer.fit_transform(fifa_num)

display(X_transformed.shape, fifa_num.shape)

plt.figure(figsize=(10, 10))
sns.heatmap(
    transformer.get_covariance(),
    annot=True
)
plt.autoscale()
plt.show()


Example No. 11
df2 = pd.DataFrame(x_scaled)  # x_scaled is assumed to come from an earlier scaling step
print(df2)
'''
for col in orig:
    orig[col] = (orig[col] - orig[col].mean()) / orig[col].std()
orig = orig.fillna(0)

# print(orig)
print(orig.columns)
X = orig.values
n_rows, n_cols = X.shape
y = orig.columns
pca = PCA().fit(X)
print('Explained variance by component: %s' % pca.explained_variance_ratio_)
print(len(pca.components_))
'''
binner = Bin(bin_start=1, axis=0)  # Bin and get_shuffle_indices are project-local helpers
binned_matrix = binner.fit_transform(X)
shuffle_indices = get_shuffle_indices(X.shape[0])


shuffled_matrix = binned_matrix[shuffle_indices, :]

'''
print("FA STARTS.........", X.shape)
fa_model = FactorAnalysis().fit(X)
print(fa_model.get_covariance().shape)
#df_plot = pd.DataFrame(fa_model.components_, columns=y)
#df_plot.plot.scatter(s= df_plot['timestamp'])
pd.plotting.scatter_matrix(orig, alpha=0.3, figsize=(14, 8), diagonal='kde')