Пример #1
0
    def runFactorAnalyzer(self, cols_to_norm, result):
        """Fit a two-factor varimax model and append the factor scores to a CSV.

        Rows of ``result`` that contain any missing value are discarded, the
        columns listed in ``cols_to_norm`` are factor-analysed, and the two
        factor scores (rounded to two decimals) are appended, together with
        the ``Session``, ``Player`` and ``ts`` columns, to
        ``MetricsEmotion.csv`` (no header row is written).

        Args:
            cols_to_norm: Labels of the columns of ``result`` to analyse.
            result: DataFrame holding those columns plus ``Session``,
                ``Player`` and ``ts``.
        """
        # Clean the frame once and slice the model input out of the cleaned
        # frame, so every factor score stays aligned with the row whose
        # Session/Player/ts values are written next to it.
        result = result.dropna()
        df = result[cols_to_norm]

        fa = FactorAnalyzer(rotation="varimax", n_factors=2)
        fa.fit(df)

        # A low overall KMO is only reported; the analysis still proceeds.
        _, kmo_model = calculate_kmo(df)
        if kmo_model < 0.6:
            print("kmo_model: %s " % kmo_model)

        scores = fa.transform(df)

        print("T session: %s " % len(result['Session']))
        dataframe = pd.DataFrame({
            'Player': result['Player'],
            'Session': result['Session'],
            'Time': result['ts'],
        })
        # The first score column is stored as the negative emotion, the second
        # as the positive emotion.
        dataframe['NegativeEmotion'] = np.around(scores[:, 0], 2)
        dataframe['PositiveEmotion'] = np.around(scores[:, 1], 2)
        dataframe.to_csv('/home/elton/Desktop/Dataset/MetricsEmotion.csv',
                         sep=',',
                         mode='a',
                         header=False)
Пример #2
0
def get_fa_loads(d_phens,
                 kmo_threshold=0.6,
                 bartlett_threshold=0.05,
                 n_shuffle=100,
                 test_factorability=False):
    """
    Compute the factor loadings of a phenotype table.

    :param d_phens: phenotype data; its ``columns`` label the rows of the result
    :param kmo_threshold: smallest overall KMO accepted by the factorability check
    :param bartlett_threshold: largest Bartlett p-value accepted by the check
    :param n_shuffle: forwarded to ``pa`` when the number of factors is chosen
    :param test_factorability: run the KMO / Bartlett check before fitting
    :return: DataFrame of loadings indexed by the phenotype names, or ``None``
        when the factorability check rejects the data
    """
    if test_factorability:
        # Only the p-value and the overall KMO are needed from the two tests.
        bartlett_value = calculate_bartlett_sphericity(d_phens)[1]
        kmo_model = calculate_kmo(d_phens)[1]
        kmo_too_low = kmo_model < kmo_threshold
        if kmo_too_low or bartlett_value > bartlett_threshold:
            warnings.warn('\nPhenotypic data does not contain factors')
            return None

    # The number of factors comes from ``pa`` (parallel analysis).
    model = FactorAnalyzer(n_factors=pa(d_phens, n_shuffle))
    model.fit(d_phens)

    return pd.DataFrame(data=model.loadings_, index=d_phens.columns)
def can_use_factor_analysis(df):
    """Return whether the overall KMO of ``df`` is not below 0.6.

    The full ``calculate_kmo`` result is printed only when the data passes.
    Bartlett's test is intentionally not evaluated here.
    """
    kmo_result = calculate_kmo(df)
    adequate = not (kmo_result[1] < .6)
    if adequate:
        print("kmo = " + str(kmo_result))
    return adequate
Пример #4
0
def perform_fa_KMO(df):
    """Print whether the overall KMO of ``df`` reaches the 0.6 cut-off."""
    kmo_model = calculate_kmo(df)[1]
    if not kmo_model < 0.6:
        print(
            f"KMO=({kmo_model}). Proportion of variance suitable for factor analysis"
        )
        return
    print(
        f"KMO=({kmo_model}). Proportion of variance NOT suitable for factor analysis"
    )
Пример #5
0
def stat_test(mat_features):
    """Print factorability diagnostics for a feature table.

    Reports the determinant of the feature correlation matrix, Bartlett's
    test of sphericity and the overall Kaiser-Meyer-Olkin statistic.

    Args:
        mat_features: DataFrame of observations, one column per feature.
    """
    m_corr = np.array(mat_features.corr())
    print('determinant feature matrix is: ', np.linalg.det(m_corr))

    # Both tests derive the correlation matrix (and the sample size) from
    # their input, so they must be given the observations themselves rather
    # than an already-computed correlation matrix.
    chi_square_value, p_value = calculate_bartlett_sphericity(mat_features)
    print('Bartlett test')
    print('value of chi square: ', chi_square_value, 'p-value: ', p_value)

    _, kmo_model = calculate_kmo(mat_features)
    print('Kaiser-Meyer-Olkin test')
    print('value of kmo: ', kmo_model)
def out_Ade(df2):
    """Run the adequacy tests on ``df2`` and return them as a small table.

    The overall KMO and the Bartlett p-value are printed and then packed into
    a three-row DataFrame: a title row, a label row and a value row.

    KMO reference scale noted by the original author: above 0.9 very good,
    above 0.8 good, 0.7 fair, 0.6 poor, 0.5 very poor, below 0.5 unacceptable.
    NOTE(review): the original note also said Bartlett's value lies in 0-1 and
    that values near 1 favour factor analysis; that reads like a description
    of KMO rather than of Bartlett's p-value - confirm.
    """
    # Kaiser-Meyer-Olkin test: only the overall statistic is kept.
    kmo_model = factor_analyzer.calculate_kmo(df2)[1]
    print('kmo:{}'.format(kmo_model))

    # Bartlett's test: only the p-value is kept.
    p_value = factor_analyzer.calculate_bartlett_sphericity(df2)[1]
    print('Bartlett_p:{}'.format(p_value))

    rows = [
        ['充分性测试(Adequacy Test)', None],
        ['kmo', 'Bartlett_p'],
        [kmo_model, p_value],
    ]
    return pd.DataFrame(rows)
Пример #7
0
def factor_analysis(df, name, all_factors=True):
    """Print factorability diagnostics for ``df`` and plot its correlations.

    Args:
        df: Data passed to the Bartlett, KMO, pairwise-correlation, eigenvalue
            and plotting helpers.
        name: Accepted for interface compatibility; not used by this function.
        all_factors: When equal to ``True`` the factor eigenvalues are printed
            as well.
    """
    bartlett_chi2, bartlett_p = calculate_bartlett_sphericity(df)
    _, overall_kmo = calculate_kmo(df)
    correlations = pairwiseCorr(df)

    report = (
        ("Chi_square_value and p_value:", (bartlett_chi2, bartlett_p)),
        ("Kmo model:", (overall_kmo, )),
        ("Pairwise correlations:", (correlations, )),
    )
    for heading, values in report:
        print("\n" + heading)
        print(*values)

    # The flag is compared with True itself (not merely tested for
    # truthiness), exactly as callers have experienced so far.
    if all_factors == True:
        factor_eigenvalues = get_factor_eigenvalues(df)
        print("\n" + "FA Eigenvalues:")
        print(factor_eigenvalues)

    plot_correlations(df)
Пример #8
0
 def KMO_test(self):
     """Log the overall KMO statistic of ``self.dataset`` with its rating.

     The rating follows the usual KMO scale: >= 0.9 marvelous,
     >= 0.8 meritorious, >= 0.7 middling, >= 0.6 mediocre, >= 0.5 miserable,
     anything lower unacceptable.
     """
     _, kmo_model = calculate_kmo(self.dataset)

     # Bands are ordered from the highest floor down so that the first match
     # is the tightest one; testing the lowest floor first would swallow
     # every value above it.
     bands = (
         (0.9, "marvelous"),
         (0.8, "meritorious"),
         (0.7, "middling"),
         (0.6, "mediocre"),
         (0.5, "miserable"),
     )
     kmo_rslt = "unacceptable"
     for floor, label in bands:
         if kmo_model >= floor:
             kmo_rslt = label
             break

     self.logger.info("KMO measure = %.3f (%s)" % (kmo_model, kmo_rslt))
Пример #9
0
def efa_model_tests(item_data):
    """Render Bartlett's sphericity and KMO results side by side in Streamlit.

    Args:
        item_data: Item responses passed unchanged to both tests.
    """
    st.header('Model Tests')

    # ``st.beta_columns`` was renamed to ``st.columns`` and the beta alias was
    # later removed; prefer the stable name and fall back to the beta one so
    # that old Streamlit installations keep working.
    make_columns = getattr(st, 'columns', None) or st.beta_columns
    col1, col2 = make_columns(2)

    with col1:
        st.write('Bartlett Sphericity')
        chi2, pValue = calculate_bartlett_sphericity(item_data)
        st.write(f'chi-squared = {chi2}, p = {pValue}')
    with col2:
        st.write('KMO Test')
        # Only the overall statistic is displayed.
        _, kmoModel = calculate_kmo(item_data)
        st.write(f'KMO Statistic: {kmoModel}')
Пример #10
0
def factor_analysis(df, name, not_test_set=True):
    """Print factorability diagnostics for ``df`` and plot its correlations.

    Args:
        df: Data passed to the Bartlett, KMO, pairwise-correlation, eigenvalue
            and plotting helpers.
        name: Forwarded to ``plot_correlations``.
        not_test_set: When equal to ``True`` the factor eigenvalues are
            printed as well.
    """
    bartlett_chi2, bartlett_p = calculate_bartlett_sphericity(df)
    _, overall_kmo = calculate_kmo(df)
    significant, strongest = pairwiseCorr(df)

    report = (
        ("Chi_square_value and p_value:", (bartlett_chi2, bartlett_p)),
        ("Kmo model:", (overall_kmo, )),
        ("Statistically significant correlations:", (significant, )),
        ("Most correlated variables:", (strongest, )),
    )
    for heading, values in report:
        print("\n" + heading)
        print(*values)

    # The flag is compared with True itself (not merely tested for
    # truthiness), exactly as callers have experienced so far.
    if not_test_set == True:
        factor_eigenvalues = get_factor_eigenvalues(df)
        print("\n" + "FA Eigenvalues:")
        print(factor_eigenvalues)

    plot_correlations(df, name)
Пример #11
0
def test_calculate_kmo():
    """Check ``calculate_kmo`` on a plain array against reference values."""
    data = pd.read_csv('tests/data/test02.csv')

    # Reference values: one KMO per column, then the overall statistic.
    expected_by_item = np.array([
        0.405516, 0.560049, 0.700033, 0.705446, 0.829063, 0.848425, 0.863502,
        0.841143, 0.877076, 0.839272
    ])
    expected_overall = 0.81498469767761361

    kmo_by_item, kmo_overall = calculate_kmo(data.values)

    assert_almost_equal(kmo_by_item, expected_by_item)
    assert_almost_equal(kmo_overall, expected_overall)
Пример #12
0
def FA(observied_variables, name):
    """Run the adequacy tests and a factor analysis, returning the loadings.

    Bartlett's test, the overall KMO, the eigenvalues and the loadings are
    printed along the way.

    Args:
        observied_variables: DataFrame of observed variables; its columns
            label the rows of the returned loadings table.
        name: Dataset key selecting the number of factors: ``'phone'`` uses
            2 factors and ``'QOL'`` uses 4.

    Returns:
        DataFrame of factor loadings indexed by the variable names.

    Raises:
        ValueError: If ``name`` is not one of the supported dataset keys.
    """
    from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
    from factor_analyzer.factor_analyzer import calculate_kmo

    # Reject unknown keys up front instead of failing later on an analyzer
    # that was never created.
    n_factors_by_name = {'phone': 2, 'QOL': 4}
    if name not in n_factors_by_name:
        raise ValueError(
            "unsupported name %r; expected one of %s" %
            (name, sorted(n_factors_by_name)))

    chi_square_value, p_value = calculate_bartlett_sphericity(
        observied_variables)
    print("chi_square_value", chi_square_value, "p-value:", p_value)
    _, kmo_model = calculate_kmo(observied_variables)
    print("KMO value", kmo_model)

    # Create factor analysis object and perform factor analysis
    fa = FactorAnalyzer(n_factors=n_factors_by_name[name])
    fa.fit(observied_variables)

    # Check Eigenvalues (only the first returned array is reported)
    eigen_values, _ = fa.get_eigenvalues()
    print(eigen_values)

    loadings = pd.DataFrame(fa.loadings_, observied_variables.columns)
    print(loadings)
    return loadings
Пример #13
0
def test_calculate_kmo():
    """Check ``calculate_kmo`` on a DataFrame against reference values."""
    data = pd.read_csv('tests/data/test02.csv')

    # Reference per-item KMOs, labelled with the item names.
    item_names = [
        'sex', 'zygosity', 'moed', 'faed', 'faminc', 'english', 'math',
        'socsci', 'natsci', 'vocab'
    ]
    item_kmos = [
        0.405516, 0.560049, 0.700033, 0.705446, 0.829063, 0.848425, 0.863502,
        0.841143, 0.877076, 0.839272
    ]
    expected_by_item = pd.DataFrame(item_kmos,
                                    columns=['KMO'],
                                    index=item_names)
    expected_overall = 0.81498469767761361

    kmo_by_item, kmo_overall = calculate_kmo(data)

    assert_almost_equal(kmo_by_item, expected_by_item)
    assert_almost_equal(kmo_overall, expected_overall)
def run_sampling_adequacy_app():
    """Render the Streamlit page for the sampling-adequacy (KMO) analysis.

    The page explains the KMO thresholds, offers the bundled sample CSV for
    download, lets the user upload their own CSV, and then shows the overall
    KMO value, the correlation table, a scree plot and the loadings of a
    three-factor promax solution. Rows with missing values and the
    ``student`` column are removed before the analysis.

    Any exception raised while processing the data is reported on the page
    as a data-format error and its details are printed to the console.
    """
    st.header('■Measure of Sampling Adequacy')
    st.write(
        'To investigate the adequay of the number of samples for questionnaire.Kaiser-Meyer-Olkin (KMO) Test is used. '
    )
    st.write('KMO values between 0.8 and 1 indicate the sampling is adequate.')
    st.write(
        'KMO values less than 0.6 indicate the sampling is not adequate and that remedial action should be taken. '
    )
    st.write(
        'Some authors put this value at 0.5, so use your own judgment for values between 0.5 and 0.6.'
    )

    st.sidebar.subheader('Data upload')
    # The bundled sample backs the download link and is also the data that is
    # analysed when nothing has been uploaded.
    df_edu = pd.read_csv("data/eng_sample_data_sampling.csv")

    def download_link(object_to_download, download_filename,
                      download_link_text):
        """Return an HTML anchor embedding a DataFrame as a base64 CSV.

        Returns ``None`` when ``object_to_download`` is not a DataFrame.
        """
        if isinstance(object_to_download, pd.DataFrame):
            object_to_download = object_to_download.to_csv(
                index=False, encoding='utf_8_sig')
            b64 = base64.b64encode(object_to_download.encode()).decode()
            return f'<a href="data:file/txt;base64,{b64}" download="{download_filename}">{download_link_text}</a>'

    tmp_download_link = download_link(df_edu, 'sample_sampling.csv',
                                      'Download sample csv file.')
    st.sidebar.markdown(tmp_download_link, unsafe_allow_html=True)

    try:

        uploaded_file = st.sidebar.file_uploader(
            "File upload (Drag and drop or use [Browse files] button to import csv file. Only utf-8 format is available.)",
            type=["csv"])
        if uploaded_file is not None:
            df_edu = pd.read_csv(uploaded_file)
            # Rewind the upload so it can be read again from the start.
            uploaded_file.seek(0)
            display_data = st.sidebar.checkbox(label='Show uploaded data')

            if display_data:
                st.dataframe(df_edu)

        else:
            # ``df_edu`` still holds the bundled sample loaded above.
            show_df = st.sidebar.checkbox('Show DataFrame')

            if show_df:
                st.write(df_edu)

        df_edu = df_edu.dropna()
        df_edu = df_edu.drop(['student'], axis=1)
        from factor_analyzer.factor_analyzer import calculate_kmo
        _, kmo_model = calculate_kmo(df_edu)
        st.write('## KMO value:', kmo_model.round(2))

        st.subheader('Data overview (correlation coefficient)')
        st.write(df_edu.corr().style.background_gradient(cmap='coolwarm'))

        # Unrotated fit with default settings, used only for the scree plot.
        fa = FactorAnalyzer()
        fa.fit(df_edu)
        ev, _ = fa.get_eigenvalues()

        # Draw on an explicit figure and hand it to Streamlit: this avoids
        # the global-pyplot code path (and the configuration switch that was
        # needed to silence its warning) and lets the figure be released
        # once it has been rendered.
        factor_numbers = range(1, df_edu.shape[1] + 1)
        fig, ax = plt.subplots(figsize=(7, 5))
        ax.scatter(factor_numbers, ev)
        ax.plot(factor_numbers, ev)
        ax.set_title('Scree Plot')
        ax.set_xlabel('Factors')
        ax.set_ylabel('Eigenvalue')
        ax.grid(True)
        st.pyplot(fig)
        plt.close(fig)

        fa = FactorAnalyzer(n_factors=3, rotation='promax', impute='drop')
        fa.fit(df_edu)
        df_result = pd.DataFrame(fa.loadings_, columns=['1st', '2nd', '3rd'])
        df_result.index = df_edu.columns
        cm = sns.light_palette('blue', as_cmap=True)
        df_factor = df_result.style.background_gradient(cmap=cm)
        st.write(df_factor)

    except Exception as e:
        # Page-level boundary: keep the app alive, tell the user, and leave
        # the actual cause in the console for whoever debugs it.
        st.header(
            'ERROR: Data inconsistency. Check data format to be uploaded.')
        print(f'Data inconsistency error: {e!r}')
Пример #15
0
# Factorability checks and eigenvalue inspection for the columns of `df`
# other than 'qc_salzrckhalt'.
# X holds every column except 'qc_salzrckhalt'; y holds that column.
X = df.drop('qc_salzrckhalt', axis=1)
y = df['qc_salzrckhalt']

# Adequacy test: evaluate the "factorability" of the dataset.
# Factorability means "can we find factors in the dataset?"

# Bartlett's test
VarbList = df.columns
chi_square_value, p_value = calculate_bartlett_sphericity(X)
chi_square_value, p_value
# --> a p-value of 0 means the test was statistically significant: the observed correlation matrix is not an identity matrix

# Kaiser-Meyer-Olkin test
kmo_all, kmo_model = calculate_kmo(X)
kmo_model
# --> a KMO value of 0.653 indicates a moderate suitability for factor analysis. Source: Cureton, E. E. / D'Agostino, R. B. 1983: Factor analysis: an applied approach. Hillsdale, NJ: Lawrence Erlbaum Associates, p. 389 f.

# Choosing the number of factors
# Create factor analysis object and perform factor analysis
fa = FactorAnalyzer(rotation=None, n_factors=30)
fa.fit(X)
# Check eigenvalues
ev, v = fa.get_eigenvalues()
ev
# --> only 30 eigenvalues are greater than 1, so only choose those?

# Create scree plot (one point per column of X)
g = plt.scatter(range(1, X.shape[1] + 1), ev)
#g = plt.plot(range(1,X.shape[1]+1),ev)
# Factorability checks on `data`: Bartlett's test of sphericity, then KMO.
# Use Bartlett's test of sphericity
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
chi_square_value,p_value=calculate_bartlett_sphericity(data)
chi_square_value,p_value


# Our p-value of 0 is less than 0.05. The test is therefore statistically significant, indicating that the observed correlation matrix is not an identity matrix and factor analysis can be conducted.
#

# In[7]:


# Determine factorability
# Perform the KMO test
from factor_analyzer.factor_analyzer import calculate_kmo
kmo_all,kmo_model=calculate_kmo(data)
print('\nKMO Model',kmo_model)


# The overall KMO for our data is 0.73, which falls in the "middling" band (0.70 to 0.79) of the usual KMO scale.
# This value indicates that we can proceed with the planned factor analysis.
#

# # Step 2: Extract factors

# Principal Component Analysis (PCA) involves compressing and reducing all the relationships in the data into newly created features called components. Each of these components represents a factor.

# In[17]:


# Extract the factors
# Factorability checks and eigenvalue inspection for `card_df_norm`.
# Bartlett's test of sphericity checks whether the observed variables intercorrelate at all, by comparing the observed correlation matrix against the identity matrix. If the test is statistically insignificant, factor analysis should not be employed.
#

# In[36]:

chi_square_value, p_value = calculate_bartlett_sphericity(card_df_norm)
print(chi_square_value, p_value)

# ##### In this Bartlett's test the p-value is 0. The test is statistically significant, indicating that the observed correlation matrix is not an identity matrix.
#

# The Kaiser-Meyer-Olkin (KMO) test measures the suitability of data for factor analysis. It determines the adequacy for each observed variable and for the complete model. KMO values range between 0 and 1; values closer to 1 indicate greater suitability, and a value below 0.6 is considered inadequate.

# In[37]:

kmo_all, kmo_model = calculate_kmo(card_df_norm)
kmo_model

# ###### Here the kmo_model value is 0.64, so it is adequate.

# ### Choosing the number of factors

# In[38]:

fa = FactorAnalyzer()
fa.set_params(n_factors=25, rotation=None)
fa.fit(card_df_norm)
# Check eigenvalues
ev, v = fa.get_eigenvalues()
ev
Пример #18
0
 def test(self):
     self.chi_square_value, self.p_value = calculate_bartlett_sphericity(
         self.data[self.col])
     self.kmo_all, self.kmo_model = calculate_kmo(self.data[self.col])
     return self.chi_square_value, self.p_value, self.kmo_all, self.kmo_model
Пример #19
0
# for these plots, and calculations of factor scores.
# Flip the sign of the `mhvars` columns in place.
fdata0[mhvars] *= -1

# Fit the factor model, no rotation at first.
fa1 = FactorAnalyzer(rotation=None).fit(fdata0)

# Determine number of factors, given eigenvalues > 1.0
# NOTE(review): index 1 selects the second array returned by
# get_eigenvalues(); in factor_analyzer that is the common-factor eigenvalues,
# while index 0 holds the eigenvalues of the original correlation matrix -
# confirm which one the "eigenvalue > 1" rule is meant to use.
nf = (fa1.get_eigenvalues()[1] > 1).sum()
fnames = [f"F{i+1}" for i in range(nf)]  # name them F1 ... FN for now

# Bartlett; the p-value should be close to 0 (statistically significant)
chi_square_value, p_value = calculate_bartlett_sphericity(fdata0)
print(f"chi2 = {chi_square_value:.02f}, p = {p_value:.04f}")

# KMO; Value should be > 0.6
kmo_all, kmo_model = calculate_kmo(fdata0)
print(f"KMO = {kmo_model:.02f}")

# Now do the factor analysis retaining only the number of factors found
# above, and do a rotation (Varimax)
rotation = "varimax"
fa = FactorAnalyzer(nf, rotation=rotation)
fa = fa.fit(fdata0)

# Calculate factor scores using the fitted factor model
# - one new column per factor score (named F1 ... FN)
# - rescale to have SD = 1.0
# The columns are created as NaN first, so rows of Zcc that are absent from
# fdata0.index keep NaN scores.
Zcc[fnames] = np.nan
Zcc.loc[fdata0.index,
        fnames] = (StandardScaler().fit_transform(fa.transform(fdata0)))
Пример #20
0
# Factor Analysis
#--------------------------------------------
'''NOTE: from the previous PCA we know that the first three principle components
    explain 72% of the variance. Hence we base our analysis on three components.
    
    The standard method of the FA package is minimum residual. Also possible:
        MLE, PCA. 
    '''

# Pre-tests, factor analysis and the start of a parallel analysis on
# df[vars_tot].
# Pre-tests
## Bartlett's test of sphericity. H0: the correlation matrix is an identity matrix
bartlett_chi, bartlett_p = calculate_bartlett_sphericity(
    df[vars_tot])  # p = 0.0

## Kaiser-Meyer-Olkin (KMO) test. Measures data suitability; lies between 0 and 1 and should be above 0.6
kmo_all, kmo_model = calculate_kmo(df[vars_tot])  #kmo_model = 0.7297

#--------------------------------------------
# Factor Analysis
fa = FactorAnalyzer(rotation=None, n_factors=4)
fa.fit(df[vars_tot])
ev, v = fa.get_eigenvalues()
'''NOTE: First four factors have an eigen value greater than 1. Use those.'''

# Perform a parallel analysis
list_ev_rand = []

# Fixed seed so the random comparison data are reproducible.
np.random.seed(10)
for i in range(100):
    # Uniform random data with the same shape as the analysed columns,
    # fitted with the same settings as the real model.
    # NOTE(review): nothing is appended to list_ev_rand in the lines visible
    # here - the loop body may continue beyond this excerpt; confirm.
    df_rand = pd.DataFrame(np.random.rand(*df[vars_tot].shape))
    fa_rand = FactorAnalyzer(rotation=None, n_factors=4).fit(df_rand)
Пример #21
0
# Adequacy tests, eigenvalues and scree plot for `resultE`.
# Calculate the number of variables (columns)
numVars = resultE.shape[1]
print(numVars)  # = 14 for these data files

# # Bartlett's test
# # Are the variables correlated at all?
# # A p-value < 0.05 means that the result is statistically significant
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
chi_square_value, p_value = calculate_bartlett_sphericity(resultE)
print(chi_square_value, p_value)
#
# # Kaiser-Meyer-Olkin test
# # Measures suitability of the data for factor analysis
# # 0 <= KMO <= 1, where KMO <= 0.6 is not adequate for factor analysis
from factor_analyzer.factor_analyzer import calculate_kmo
kmo_all, kmo_model = calculate_kmo(resultE)
print(kmo_model)
#
# Create factor analysis object and perform factor analysis
fa = FactorAnalyzer()
fa.fit(resultE)
# Check eigenvalues
ev, v = fa.get_eigenvalues()
print(
    ev
)  # the number of eigenvalues >= 1 is the maximum number of factors you potentially need to consider

# Create scree plot using matplotlib
# Look at the difference between consecutive eigenvalues - after a certain point they are very similar
plt.scatter(range(1, resultE.shape[1] + 1), ev)
plt.plot(range(1, resultE.shape[1] + 1), ev)
Пример #22
0
# KMO check for `fifa_num`, overall and per column.
# KMO in [0.8; 1] indicates the sampling is adequate.
# KMO in [0.5; 0.6] indicates the sampling is not adequate and that remedial
# action should be taken.
# KMO -> 0 means that there are large partial correlations compared to the
# sum of correlations.
# In other words, there are widespread correlations, which are a large problem
# for factor analysis.

# 0.00 to 0.49 unacceptable.
# 0.50 to 0.59 miserable.
# 0.60 to 0.69 mediocre.
# 0.70 to 0.79 middling.
# 0.80 to 0.89 meritorious.
# 0.90 to 1.00 marvelous.

kmo_all, kmo_model = calculate_kmo( fifa_num )

display( kmo_model )  # kmo_model = 0.5555331829032943 -> miserable == inadequate data

# One row per column of fifa_num with its individual KMO value.
col_msa_value_df = pd.DataFrame({
    'col_name': fifa_num.columns,
    'MSA_value': kmo_all  # measure of sampling adequacy
})

display(col_msa_value_df)


# In[8]:


# MSA < 0.5 - variable should be dropped
#--------------------------------------------
''' We perform two pre-tests
    1) Bartlett's test of sphericity: tests whether the correlation matrix 
        equals an identiy matrix (H0), which means that the variables are 
        unrelated;
    2) Kaiser-Meyer-Olkin test: Is a statistic that indicates the proportion
        of variance that might be caused by underlying factors. Test statistic
        should be over .5'
    '''

# Pre-tests on `df_standard`.
# Bartlett's test of sphericity. H0: the correlation matrix is an identity matrix
bartlett_chi, bartlett_p = calculate_bartlett_sphericity(
    df_standard)  # p = 0.0

# Kaiser-Meyer-Olkin (KMO) test. Measures data suitability; lies between 0 and 1 and should be above 0.5
kmo_all, kmo_model = calculate_kmo(df_standard)  #kmo_model = 0.85
'''Note: looks good '''

#--------------------------------------------
# Determine number of factors
#--------------------------------------------
'''We use multiple selection criteria:
    1) Scree plot (elbow plot)
    2) Kaiser-Guttman rule
    3) Parallel analysis
    '''

# Get factor estimates: unrotated fit with the 'principal' method, then
# retrieve the eigenvalues.
fa = FactorAnalyzer(rotation=None, method='principal')
fa.fit(df_standard)
ev, v = fa.get_eigenvalues()