Example #1
def factors_lst(number_factors, lst_obs, prnt):
    """
    Does the Factor anlaysing/dimensionality reduction for a list of observations with a loop ober that list
    :param number_factors:  the number of factors to be taken (the reduced dimensionality), has to be the same for
                            all the list elements (integer)
    :param lst_obs:         list,  elements hold the dF/F for the Mouse/sessions/trial type of one specific trial
    :return:                list, elements hold the transformed observations. the shape of the elements is now
                            (time steps, number of factors)
    """

    lst_obs_transformed = []

    for item in lst_obs:

        fa = FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
                            method='minres', n_factors=number_factors, rotation=None, rotation_kwargs={},
                            use_smc=True)

        fa.fit(item)
        obs_transformed = fa.transform(item)

        lst_obs_transformed.append(obs_transformed)

    if prnt:

        print()
        print('shape of one element of the list')
        print('after the dim reduction:                  ', np.shape(lst_obs_transformed[0]))
        print('number of time steps:                     ', lst_obs_transformed[0].shape[0])
        print('number of dimensions:                     ', lst_obs_transformed[0].shape[1])

    return lst_obs_transformed
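A minimal usage sketch for factors_lst, assuming numpy and FactorAnalyzer are imported as in the example; the random arrays are only placeholders for dF/F matrices of shape (time steps, units):

import numpy as np

rng = np.random.RandomState(0)
# two trials, each with 200 time steps and 30 recorded units
lst_obs = [rng.normal(size=(200, 30)) for _ in range(2)]
lst_transformed = factors_lst(number_factors=5, lst_obs=lst_obs, prnt=True)
# each element of lst_transformed now has shape (200, 5)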
Example #2
    def factor_analysis(self, input_x):
        ss_x = StandardScaler().fit_transform(input_x)
        norm_x = normalize(input_x, axis=0)  # computed here but not used below
        factor_number = 9
        fa = FactorAnalyzer(
            n_factors=factor_number,
            rotation='oblimin')  # oblimin/promax are oblique rotations; varimax is orthogonal
        fa.fit(ss_x)
        ev, v = fa.get_eigenvalues()
        factor_loading_matrix = fa.loadings_
        fa_score = fa.transform(ss_x)
        print('ev', ev)
        # print('v',v)
        # print('factor_loading_matrix',factor_loading_matrix)
        fa_name = list(self.table_data.columns[1::])
        # print('quantization_score', len(fa_name),fa_name)
        for i in range(factor_number):
            all_coefficients = np.sort(factor_loading_matrix[:, i])
            coefficients_index = np.argsort(factor_loading_matrix[:, i])
            print('factor_i', i)
            for j, coefficient in enumerate(all_coefficients):
                if coefficient > 0.5:
                    print('coefficients_index', coefficients_index[j],
                          fa_name[coefficients_index[j]])

        plt.scatter(range(1, input_x.shape[1] + 1), ev)
        plt.plot(range(1, input_x.shape[1] + 1), ev)
        plt.title('scree figure')
        plt.ylabel('eigenvalues')
        plt.grid()
        plt.show()

        return fa_score
Example #3
def get_MFA_params(zl, kl, rl_nextl):
    ''' Determine clusters with a GMM and then fit a Factor Model to each cluster
    zl (ndarray): The lth layer latent variable
    kl (int): The number of components of the lth layer
    rl_nextl (1darray): The dimensions of the lth and (l+1)th layers
    -----------------------------------------------------
    returns (dict): Dict with the parameters of the MFA approximated by GMM + FA. 
    '''
    #======================================================
    # Fit a GMM in the continuous space
    #======================================================
    numobs = zl.shape[0]

    not_all_groups = True
    max_trials = 100
    empty_count_counter = 0

    while not_all_groups:
        # If there are not enough obs per group, the MFA diverges...

        gmm = GaussianMixture(n_components=kl)
        s = gmm.fit_predict(zl)

        clusters_found, count = np.unique(s, return_counts=True)

        if (len(clusters_found) == kl):  # & (count >= 5).all():
            not_all_groups = False

        empty_count_counter += 1
        if empty_count_counter >= max_trials:
            raise RuntimeError(
                'Could not find a GMM init that presents the '
                'proper number of groups:', kl)

    psi = np.full((kl, rl_nextl[0], rl_nextl[0]), 0).astype(float)
    psi_inv = np.full((kl, rl_nextl[0], rl_nextl[0]), 0).astype(float)
    H = np.full((kl, rl_nextl[0], rl_nextl[1]), 0).astype(float)
    eta = np.full((kl, rl_nextl[0]), 0).astype(float)
    z_nextl = np.full((numobs, rl_nextl[1]), np.nan).astype(float)

    #========================================================
    # And then an MFA on each of those groups
    #========================================================

    for j in range(kl):
        indices = (s == j)
        fa = FactorAnalyzer(rotation=None, method='ml', n_factors=rl_nextl[1])
        fa.fit(zl[indices])

        psi[j] = np.diag(fa.get_uniquenesses())
        H[j] = fa.loadings_
        psi_inv[j] = np.diag(1 / fa.get_uniquenesses())
        z_nextl[indices] = fa.transform(zl[indices])

        eta[j] = np.mean(zl[indices], axis=0)

    params = {'H': H, 'psi': psi, 'z_nextl': z_nextl, 'eta': eta, 'classes': s}
    return params
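A minimal usage sketch for get_MFA_params, assuming numpy, GaussianMixture and FactorAnalyzer are imported as in the example; the two shifted Gaussian blobs are only placeholder latent variables:

import numpy as np

rng = np.random.RandomState(0)
zl = np.vstack([rng.normal(loc=-2.0, size=(150, 6)),
                rng.normal(loc=2.0, size=(150, 6))])
params = get_MFA_params(zl, kl=2, rl_nextl=[6, 2])
print(params['H'].shape)        # (2, 6, 2): one loading matrix per cluster
print(params['z_nextl'].shape)  # (300, 2): next-layer scores for every observation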
Example #4
File: lb_v2.py  Project: ikem55/HRsystem
    def factory_analyze_raceuma_result_df(self, race_df, input_raceuma_df,
                                          dict_folder):
        """ RaceUmaの因子分析を行うためのデータを取得 """
        print("factory_analyze_raceuma_result_df")
        temp_df = pd.merge(input_raceuma_df, race_df, on="競走コード")
        X = temp_df[[
            '競走コード', '馬番', '枠番', 'タイム指数', '単勝オッズ', '先行率', 'ペース偏差値', '距離増減',
            '斤量比', '追込率', '平均タイム', "距離", "頭数", "非根幹", "上り係数", "逃げ勝ち", "内勝ち",
            "外勝ち", "短縮勝ち", "延長勝ち", "人気勝ち", "1番人気", "3角先頭", "4角先頭", "上がり最速",
            "上がりタイム", "連闘", "休み明け", "大差負け", "展開脚質", "展開脚色"
        ]]

        mmsc_columns = ["頭数", "展開脚質", "展開脚色", "上がりタイム"]
        mmsc_dict_name = "sc_fa_race_mmsc"
        stdsc_columns = ["距離"]
        stdsc_dict_name = "sc_fa_race_stdsc"
        X = mu.scale_df_for_fa(X, mmsc_columns, mmsc_dict_name, stdsc_columns,
                               stdsc_dict_name, dict_folder)

        X_fact = X.drop(["競走コード", "馬番"], axis=1).astype({
            '非根幹': int,
            '逃げ勝ち': int,
            '内勝ち': int,
            '外勝ち': int,
            '短縮勝ち': int,
            '延長勝ち': int,
            '人気勝ち': int,
            '1番人気': int,
            '3角先頭': int,
            '4角先頭': int,
            '上がり最速': int,
            '休み明け': int,
            '連闘': int,
            '大差負け': int
        })

        X_fact = X_fact.replace(np.inf,
                                np.nan).fillna(X_fact.median()).fillna(0)
        X_fact.iloc[0] = X_fact.iloc[0] + 0.000001

        dict_name = "fa_raceuma_result_df"
        filename = dict_folder + dict_name + '.pkl'
        if os.path.exists(filename):
            fa = mu.load_dict(dict_name, dict_folder)
        else:
            fa = FactorAnalyzer(n_factors=5, rotation='promax', impute='drop')
            fa.fit(X_fact)
            mu.save_dict(fa, dict_name, dict_folder)

        fa_np = fa.transform(X_fact)
        fa_df = pd.DataFrame(fa_np,
                             columns=["fa_1", "fa_2", "fa_3", "fa_4", "fa_5"])
        fa_df = pd.concat([X[["競走コード", "馬番"]], fa_df], axis=1)
        X_fact = pd.merge(input_raceuma_df, fa_df, on=["競走コード", "馬番"])
        return X_fact
Example #5
def get_fa(input_: Array,
           learn_input: Array,
           learn_weight_vec: Opt[Array],
           n_comp_list: Iterable[int],
           err_printer: Callable[[Array, Array, str], None] = None,
           normalize_x: bool = True,
           normalize_z: bool = False) -> LinearAnalyzer:
    """ The last from ``n_comp_list`` would be returned. """

    n_comp_list = list(n_comp_list)

    x = x_normalized = learn_input  # (~6000, ~162)
    weight_vec = learn_weight_vec
    μ_x: Union[Array, int] = 0
    σ_x: Union[Array, int] = 1
    if normalize_x:
        x_normalized, μ_x, σ_x = get_x_normalized_μ_σ(x, weight_vec)
    Σ_x = np.cov(x_normalized.T, aweights=weight_vec)  # (~162, ~162)

    for j, i in enumerate(n_comp_list):
        fa = FactorAnalyzer(n_factors=i, is_corr_matrix=True, rotation=None)
        fa.fit(Σ_x)
        fa.mean_ = np.zeros(x.shape[1])
        fa.std_ = fa.mean_ + 1.
        z = fa.transform(x_normalized)  # same as:
        # from numpy.linalg import inv
        # (~6000, ~9) = (~6000, ~162) @ ((~162, ~162) @ (~162, ~9))
        # z = ((x_normalized - 0) / 1) @ (inv(Σ_x) @ fa.structure_)

        inverse_transform_matrix, μ_z, σ_z = get__inverse_transform_matrix__μ_z__σ_z(
            z, weight_vec, normalize_z, x_normalized)

        an = LinearAnalyzer(n=fa.n_factors,
                            analyzer=fa,
                            x=input_,
                            μ_x=μ_x,
                            σ_x=σ_x,
                            μ_z=μ_z,
                            σ_z=σ_z,
                            inverse_transform_matrix=inverse_transform_matrix,
                            normalize_x=normalize_x,
                            normalize_z=normalize_z)

        if err_printer is not None:
            pref = f"Factors N = {fa.n_factors}, "
            err_printer(input_, an.x_rec, pref)

        if (j + 1) == len(n_comp_list):
            break
    else:
        raise ValueError('Empty n_comp_list')
    return an
Example #6
class CompositeFATransformer(BaseEstimator, TransformerMixin):
    """
	This class takes a DataFrame and performs n-factor analysis, producing a weighted
	composite score as well as n-factors.

	Attributes
	---
	num_factors (int): The number of factors to be used for Factor Analysis
	rotation (str): The rotation to be used by the Factor Analyzer
	method (str): The method to be used by the Factor Analyzer
	"""
    def __init__(self, num_factors, rotation='varimax', method='principal'):
        self.num_factors = num_factors
        self.rotation = rotation
        self.method = method

    def get_ev(self, X):
        num_features = len(X.columns)
        fa = FactorAnalyzer(num_features, rotation=None, method=self.method)
        fa.fit(X)

        ev, v = fa.get_eigenvalues()
        return ev

    def fit(self, X, y=None):
        ev = self.get_ev(X)
        self.weighted_ev = ev[:self.num_factors] / sum(ev[:self.num_factors])

        self.fa = FactorAnalyzer(n_factors=self.num_factors,
                                 rotation=self.rotation,
                                 method=self.method)
        self.fa.fit(X)

        return self

    def transform(self, X):
        lf = pd.DataFrame(self.fa.transform(X))
        lf.columns = ['factor_%i' % (int(x) + 1) for x in lf.columns]
        lf['composite_score'] = lf.apply(
            lambda x: np.dot(self.weighted_ev, np.array(x)), axis=1)
        lf.index = X.index
        return lf
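A minimal usage sketch for CompositeFATransformer, assuming numpy, pandas and FactorAnalyzer are imported as in the example; the random frame is only a placeholder for survey-style item scores:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(100, 8)),
                  columns=['item_%i' % i for i in range(8)])
cfa = CompositeFATransformer(num_factors=3)
scores = cfa.fit(df).transform(df)
print(scores.columns.tolist())  # ['factor_1', 'factor_2', 'factor_3', 'composite_score']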
Example #7
def factors(num_FA_dim, obs, kind, prnt):
    """
    Performs the factor analysis/dimensionality reduction
    :param num_FA_dim:  the number of factors generating the data, i.e. those with eigenvalues greater than the
                        eigenvalue limit (integer)
    :param obs:         data to be generated by the factors (2d np.array)
    :param kind:        0, 1 or 2 depending on the data: averaged data: kind = 0, single trial: kind = 1,
                        concatenated data: kind = 2
    :param prnt:        if True, print the shape of the transformed array (bool)
    :return:            the factors generating the data with fewer dimensions
                        (2d np.array, shape: (time steps, num_FA_dim))
    """

    fa = FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
                        method='minres', n_factors=num_FA_dim, rotation=None, rotation_kwargs={},
                        use_smc=True)

    fa.fit(obs)
    obs_transformed_FA = fa.transform(obs)

    if prnt:

        if kind == 0:
            print('shape of the "obs_transformed_FA" array:     ', np.shape(obs_transformed_FA))
            print('number of time steps:                        ', obs_transformed_FA.shape[0])
            print('number of dimensions:                        ', obs_transformed_FA.shape[1])
            print()

        elif kind == 1:
            print('shape of the "obs_transformed_FA_s" array:   ', np.shape(obs_transformed_FA))
            print('number of time steps:                        ', obs_transformed_FA.shape[0])
            print('number of dimensions:                        ', obs_transformed_FA.shape[1])
            print()

        elif kind == 2:
            print('shape of the "obs_transformed_FA_c" array:   ', np.shape(obs_transformed_FA))
            print('number of time steps:                        ', obs_transformed_FA.shape[0])
            print('number of dimensions:                        ', obs_transformed_FA.shape[1])
            print()

    return obs_transformed_FA
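A minimal usage sketch for factors, assuming numpy and FactorAnalyzer are imported as in the example; the random array is only a placeholder for averaged dF/F data (kind = 0):

import numpy as np

obs = np.random.RandomState(0).normal(size=(200, 30))   # (time steps, units)
obs_fa = factors(num_FA_dim=5, obs=obs, kind=0, prnt=True)
print(obs_fa.shape)  # (200, 5)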
Example #8
    def factory_analyze_raceuma_result_df(self, race_df, input_raceuma_df,
                                          dict_folder):
        """ RaceUmaの因子分析を行うためのデータを取得 """
        print("-- check! this is BaseTransform class: " +
              sys._getframe().f_code.co_name)
        temp_df = pd.merge(input_raceuma_df, race_df, on="競走コード")
        X = temp_df[[
            '競走コード', '馬番', 'タイム指数', '単勝オッズ', '先行率', 'ペース偏差値', '距離増減', '斤量比',
            '追込率', '平均タイム', "距離", "頭数", "非根幹", "上り係数", "逃げ勝ち", "内勝ち", "外勝ち",
            "短縮勝ち", "延長勝ち", "人気勝ち"
        ]]

        mmsc_columns = ["頭数"]
        mmsc_dict_name = "sc_fa_race_mmsc"
        stdsc_columns = ["距離"]
        stdsc_dict_name = "sc_fa_race_stdsc"
        X = mu.scale_df_for_fa(X, mmsc_columns, mmsc_dict_name, stdsc_columns,
                               stdsc_dict_name, dict_folder)

        X_fact = X.drop(["競走コード", "馬番"], axis=1)

        X_fact = X_fact.replace(np.inf,
                                np.nan).fillna(X_fact.median()).fillna(0)
        X_fact.iloc[0] = X_fact.iloc[0] + 0.000001

        dict_name = "fa_raceuma_result_df"
        filename = dict_folder + dict_name + '.pkl'
        if os.path.exists(filename):
            fa = mu.load_dict(dict_name, dict_folder)
        else:
            fa = FactorAnalyzer(n_factors=3, rotation='promax', impute='drop')
            fa.fit(X_fact)
            mu.save_dict(fa, dict_name, dict_folder)

        fa_np = fa.transform(X_fact)
        fa_df = pd.DataFrame(fa_np, columns=["fa_1", "fa_2", "fa_3"])
        fa_df = pd.concat([X[["競走コード", "馬番"]], fa_df], axis=1)
        X_fact = pd.merge(input_raceuma_df, fa_df, on=["競走コード", "馬番"])
        return X_fact
Example #9
class FATransformerInPlace(BaseEstimator, TransformerMixin):
    """
	This class takes a DataFrame and converts a subset of features into a single feature
	using 1-Factor Analysis.

	Attributes
	---
	feature_names (list): A list of features that need to be condensed
	composite_feature_name (str): Name of the new feature column
	rotation (str): The rotation to be used by the Factor Analyzer
	method (str): The method to be used by the Factor Analyzer
	"""
    def __init__(self,
                 feature_names,
                 composite_feature_name,
                 rotation='varimax',
                 method='principal'):
        self.feature_names = feature_names
        self.rotation = rotation
        self.method = method
        self.composite_feature_name = composite_feature_name

    def fit(self, X, y=None):
        fa_df = X[self.feature_names]
        self.fa = FactorAnalyzer(1, rotation=self.rotation, method=self.method)
        self.fa.fit(fa_df)

        return self

    def transform(self, X):
        X_fa = X[self.feature_names]
        lf = pd.DataFrame(self.fa.transform(X_fa))
        lf.index = X.index
        lf.columns = [self.composite_feature_name]

        df = X.drop(self.feature_names, axis=1)
        df = df.join(lf, how='left')
        return df
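A minimal usage sketch for FATransformerInPlace, assuming numpy, pandas and FactorAnalyzer are imported as in the example; three correlated placeholder columns are condensed into one composite feature:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
base = rng.normal(size=100)
df = pd.DataFrame({'a': base + rng.normal(scale=0.3, size=100),
                   'b': base + rng.normal(scale=0.3, size=100),
                   'c': base + rng.normal(scale=0.3, size=100),
                   'other': rng.normal(size=100)})
tfm = FATransformerInPlace(feature_names=['a', 'b', 'c'],
                           composite_feature_name='abc_factor')
out = tfm.fit(df).transform(df)
print(out.columns.tolist())  # ['other', 'abc_factor']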
Example #10
import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer

pairs = pd.read_csv('test_pairs.csv')
test = np.load('test_set.npy')

fa = FactorAnalyzer(rotation='varimax', n_factors=512)
fa.fit(test)
test_factor = fa.transform(test)

distances = []


#from https://github.com/marcelcaraciolo/crab/blob/master/crab/similarities/similarity_distance.py
def sim_pearson(vector1, vector2, **args):
    '''
    This correlation implementation is equivalent to the cosine similarity
    since the data it receives is assumed to be centered -- mean is 0. The
    correlation may be interpreted as the cosine of the angle between the two
    vectors defined by the users' preference values.
    Parameters:
        vector1: The vector you want to compare
        vector2: The second vector you want to compare
        args: optional arguments
    The value returned is in [0,1].
    '''
    # Using Content Mode.
    if type(vector1) == type({}):
        sim = {}
        [sim.update({item: 1}) for item in vector1 if item in vector2]
Example #11
    fa = FactorAnalyzer(n_factors=2, rotation="varimax", method="ml")
    fa.fit(z)

    correlation_matrix = fa.corr_
    factor_correlation_matrix = fa.phi_  # phi_ is None for orthogonal rotations such as varimax
    factor_loading_matrix = fa.loadings_
    rotation_matrix = fa.rotation_matrix_

    print("loadings: ", fa.loadings_.shape)

    # overall plot of factor space
    plt.figure()

    # Biplot for loading factors
    bi_plot(factor_loading_matrix, xlim=[-1.5, 1.5], ylim=[-3, 3])

    # vehicle scatter
    for i, v in enumerate(v_data):

        # do zscoring
        z = stats.zscore(v[:, :28])

        # transform to factor space
        tf = fa.transform(z)

        # plot
        plt.scatter(tf[:, 0], tf[:, 1], s=5, label=str(v_names[i]))

    plt.legend()
    plt.show()
Example #12
File: score.py  Project: kfollette/QuaRCS
def score(database, semester, year, season, answer_key, savedname):
    '''
    Modified so that it uses numerical values of question/answer rather than string values.
    By:
        Ilija Nikolov, 5 March 2018
    '''

    '''
        The score function reads in a QuaRCS dataset and answer key file to create a series of columns
        to add to the dataset. The function creates columns for:
        - score on a binary scale (1 for correct, 0 for incorrect)
        - total score
        - totals and means by category
        - number of questions answered
        - total and mean confidence
        Args:
            database: pre or post QuaRCS dataset for a semester
            answer_key: QuaRCS Assessment Answer Key
            semester: 'PRE' or 'PST'
        Output:
            name of file + '_scored' as .csv file
        Example:
            score('QuaRCS_Summer_2017_Pre.csv', 'PRE', 'QuaRCS Assessment Answer Key.csv', 'QuaRCS_Summer_2017_Pre')
            New File saved under QuaRCS_Summer_2017_Pre_scored.csv
            Check folder for files
        By:
            Abdoulaye Sanogo, 08/11/2017
        Future Improvements:
            add columns for confidence means and totals by category
            add extra columns after insert so the deletion of columns will not be necessary
    '''

    question = semester + "_Q" # question = 'PRE_Q' or 'PST_Q'

    data = pd.read_csv(database, encoding = 'utf-8', skiprows = [1,2], header = 0)
    df = pd.read_csv(answer_key, encoding = 'utf-8')


    cols = list(data.columns.values)
    c = len(cols)
    e = 0
    h = len(data)

    # Adds the Q#_SCORE column right next to each question
    questions = np.unique(df['Question #'])

    for item in questions:
        if(question+str(item) in data.columns):
            data.insert(data.columns.get_loc(question+str(item))+1,question+str(item)+'_SCORE', 0)

    # e >= 50 --> Full, e < 50 --> Lite
    for d in range(c):
        column = cols[d]
        column = column[0:5]

        if question == column:
            e = e + 1

    data.insert(6 , 'VERSION', " ")

    if e == 50:
        if(year == "16" and season == "Fa"):
            data['VERSION'] = "Fl_2.0"
            # If the value "progress bar" is in comments, change the version to 2.1
            for v in range(h):
                if 'COMMENTS' in data.columns:
                    if (data.loc[v, 'COMMENTS'] == "progress bar"):
                        data.loc[v, 'VERSION'] = "Fl_2.1"
        else:
            data['VERSION'] = "Fl_1.1"
    elif e == 54:
        data['VERSION'] = "Fl_1.0"
        data = data.drop([semester + '_Q18'], axis=1)
        data = data.drop([semester + '_Q18CF'], axis=1)
        data = data.drop([semester + '_Q25'], axis=1)
        data = data.drop([semester + '_Q25CF'], axis=1)
        e = 50
    elif e == 22:
        data['VERSION'] = "Lt_1.0"
    elif e == 30:
        intyr = int(year)
        if (intyr >= 19 or (year == "18" and season == "Fa")):
            data['VERSION'] = "Lt_2.1"
        else:
            data['VERSION'] = "Lt_2.0"
    elif e == 28:
        data['VERSION'] = "SM_1.0"

    # New columns for the totals
    data[semester + '_TOTAL'] = np.nan
    data[semester + '_PCT_TOTAL'] = np.nan
    data[semester + '_GR_TOTAL'] = np.nan
    data[semester + '_GR_MEAN'] = np.nan
    data[semester + '_AR_TOTAL'] = np.nan
    data[semester + '_AR_MEAN'] = np.nan
    data[semester + '_PR_TOTAL'] = np.nan
    data[semester + '_PR_MEAN'] = np.nan
    data[semester + '_PC_TOTAL'] = np.nan
    data[semester + '_PC_MEAN'] = np.nan
    data[semester + '_SP_TOTAL'] = np.nan
    data[semester + '_SP_MEAN'] = np.nan
    data[semester + '_TR_TOTAL'] = np.nan
    data[semester + '_TR_MEAN'] = np.nan
    data[semester + '_AV_TOTAL'] = np.nan
    data[semester + '_AV_MEAN'] = np.nan
    #data[semester + '_ER_MEAN'] = np.nan
    data[semester + '_UD_TOTAL'] = np.nan
    data[semester + '_UD_MEAN'] = np.nan
    data[semester + '_ES_TOTAL'] = np.nan
    data[semester + '_ES_MEAN'] = np.nan

    # Composite Variables
    data[semester + '_SELFEFF'] = np.nan
    data[semester + '_MATHANX'] = np.nan
    data[semester + '_MATHREL'] = np.nan
    data[semester + '_ACADMAT'] = np.nan
    data[semester + '_SCHMATH'] = np.nan

    corr_ans = {15: 0, 12:0, 14:0, 26:0, 27:0, 23:0, 28:0, 19:0, 3:0, 16:0, 13:0, 31:0,
                          32:0, 29:0, 30:0, 5:0, 6:0, 7:0, 10:0, 11:0, 20:0, 21:0, 33:0, 34:0, 35:0}
    for item in corr_ans:
        corr_ans[item] = int(list(df.loc[df['Question #']==item]['Correct Answer'])[0])

    # Adds totals and means to total and means columns
    for nn in range(h):
        qn = {15: 0, 12:0, 14:0, 26:0, 27:0, 23:0, 28:0, 19:0, 3:0, 16:0, 13:0, 31:0, 32:0, 29:0, 30:0, 5:0, 6:0, 7:0, 10:0, 11:0, 20:0, 21:0, 33:0, 34:0, 35:0}

        for q_num in qn:
            try:

                if(int(data.loc[nn, question + str(q_num)]) == corr_ans[q_num]):

                    qn[q_num] = 1
                    data.loc[nn, question+str(q_num)+'_SCORE'] = 1
            except:
                pass


        GR = int(np.nansum([qn[15], qn[14], qn[12], qn[29], qn[30], qn[13]]))
        AR = int(np.nansum([qn[15], qn[14], qn[26], qn[27], qn[23], qn[28], qn[19], qn[3], qn[16], qn[31], qn[32], qn[5], qn[6], qn[7], qn[29], qn[30], qn[10], qn[11], qn[20], qn[21], qn[33], qn[34], qn[35]]))
        PR = int(np.nansum([qn[15], qn[12], qn[14], qn[23], qn[28], qn[3], qn[16], qn[7], qn[10], qn[11], qn[20], qn[21], qn[33], qn[35], qn[13]]))
        PC = int(np.nansum([qn[27], qn[3], qn[32], qn[20], qn[21]]))
        SP = int(np.nansum([qn[27], qn[23], qn[28], qn[29], qn[30], qn[20], qn[21]]))
        TR = int(np.nansum([qn[26], qn[27], qn[23]]))
        AV = int(np.nansum([qn[31], qn[10], qn[11], qn[33], qn[34]]))
        UD = int(np.nansum([qn[31], qn[6], qn[7], qn[35], qn[16]]))
        ES = int(np.nansum([qn[15], qn[12], qn[14], qn[16], qn[13]]))
        data.loc[nn, semester + '_GR_TOTAL'] = GR
        data.loc[nn, semester + '_AR_TOTAL'] = AR
        data.loc[nn, semester + '_PR_TOTAL'] = PR
        data.loc[nn, semester + '_PC_TOTAL'] = PC
        data.loc[nn, semester + '_SP_TOTAL'] = SP
        data.loc[nn, semester + '_TR_TOTAL'] = TR
        data.loc[nn, semester + '_AV_TOTAL'] = AV
        data.loc[nn, semester + '_UD_TOTAL'] = UD
        data.loc[nn, semester + '_ES_TOTAL'] = ES
        total_full = 0

        for q_num in qn:
                total_full += qn[q_num]
        if e == 50:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full/(25)
            data.loc[nn, semester + '_GR_MEAN'] = GR/6
            data.loc[nn, semester + '_AR_MEAN'] = AR/23
            data.loc[nn, semester + '_PR_MEAN'] = PR/15
            data.loc[nn, semester + '_PC_MEAN'] = PC/5
            data.loc[nn, semester + '_SP_MEAN'] = SP/7
            data.loc[nn, semester + '_TR_MEAN'] = TR/3
            data.loc[nn, semester + '_AV_MEAN'] = AV/5
            data.loc[nn, semester + '_UD_MEAN'] = UD/5
            data.loc[nn, semester + '_ES_MEAN'] = ES/5

        elif e == 22:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full/(11)
            data.loc[nn, semester + '_GR_MEAN'] = GR/4
            data.loc[nn, semester + '_AR_MEAN'] = AR/9
            data.loc[nn, semester + '_PR_MEAN'] = PR/8
            data.loc[nn, semester + '_SP_MEAN'] = SP/3
            data.loc[nn, semester + '_TR_MEAN'] = TR/3
            data.loc[nn, semester + '_ES_MEAN'] = ES/5

        #lacks number of questions for meaningful subscore
            #1 q
            data.loc[nn, semester + '_UD_MEAN'] = np.nan
            data.loc[nn, semester + '_UD_TOTAL'] = np.nan
            #2 qs
            data.loc[nn, semester + '_PC_MEAN'] = np.nan
            data.loc[nn, semester + '_PC_TOTAL'] = np.nan
            #1 q
            data.loc[nn, semester + '_AV_MEAN'] = np.nan
            data.loc[nn, semester + '_AV_TOTAL'] = np.nan

        elif e == 30:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full/(15)
            data.loc[nn, semester + '_GR_MEAN'] = GR/4
            data.loc[nn, semester + '_AR_MEAN'] = AR/13
            data.loc[nn, semester + '_PR_MEAN'] = PR/11
            data.loc[nn, semester + '_SP_MEAN'] = SP/3
            data.loc[nn, semester + '_TR_MEAN'] = TR/3
            data.loc[nn, semester + '_AV_MEAN'] = AV/4
            data.loc[nn, semester + '_ES_MEAN'] = ES/5
        #lacks number of questions for meaningful subscore
            #1 q
            data.loc[nn, semester + '_UD_MEAN'] = np.nan
            data.loc[nn, semester + '_UD_TOTAL'] = np.nan
            #2 qs
            data.loc[nn, semester + '_PC_MEAN'] = np.nan
            data.loc[nn, semester + '_PC_TOTAL'] = np.nan

        elif e == 28:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full/(14)
            data.loc[nn, semester + '_GR_MEAN'] = GR/4
            data.loc[nn, semester + '_AR_MEAN'] = AR/13
            data.loc[nn, semester + '_PR_MEAN'] = PR/9
            data.loc[nn, semester + '_PC_MEAN'] = PC/3
            data.loc[nn, semester + '_SP_MEAN'] = SP/7
            data.loc[nn, semester + '_UD_MEAN'] = UD/5
            data.loc[nn, semester + '_ES_MEAN'] = ES/3

        #lacks number of questions for meaningful subscore
            #2 q
            data.loc[nn, semester + '_TR_MEAN'] = np.nan
            data.loc[nn, semester + '_TR_TOTAL'] = np.nan
            #1 q
            data.loc[nn, semester + '_AV_MEAN'] = np.nan
            data.loc[nn, semester + '_AV_TOTAL'] = np.nan



    data[semester  + '_CF_TOTAL'] = np.nan
    data[semester  + '_CF_TOTAL_CORR'] = np.nan
    data[semester  + '_CF_TOTAL_INCORR'] = np.nan
    data[semester + '_CF_MEAN'] = np.nan
    data[semester + '_CF_MEAN_CORR'] = np.nan
    data[semester + '_CF_MEAN_INCORR'] = np.nan


    # Calculates confidence totals and means; adds to respective columns
    for u in range(h):
        qcf = {'15': 0, '12':0, '14':0, '26':0, '27':0, '23':0, '28':0, '19':0, '3':0, '16':0, '13':0, '31':0, '32':0, '29':0, '30':0, '5':0, '6':0, '7':0, '10':0, '11':0,'20':0, '21':0, '33':0, '34':0, '35':0}
        qc = {'15': 0, '12':0, '14':0, '26':0, '27':0, '23':0, '28':0, '19':0, '3':0, '16':0, '13':0, '31':0, '32':0, '29':0, '30':0, '5':0, '6':0, '7':0, '10':0, '11':0,'20':0, '21':0, '33':0, '34':0, '35':0}

        for q_num in qcf:
            try:
                qcf[q_num] = int(data.loc[u, question + str(q_num) + "CF"])

                qc[q_num] = int(data.loc[u, question + str(q_num) + '_SCORE'])
            except:
                pass

        medscore = 0
        corrscore = 0
        incorrscore = 0
        confcount = 0
        for item in qcf:
            medscore += qcf[item]

            if qcf[item] > 0:
                confcount +=1

                if qc[item] == 1:
                    corrscore += qcf[item]
                else:
                    incorrscore += qcf[item]
        #print(confcount)
        if (confcount == 0):
            confcount = 1
        # Student's score
        numcorr = data.loc[u, semester + '_TOTAL']

        # Calculate confidence scores
        if e == 30:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = corrscore
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount

            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0

        elif e == 22:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = np.nan
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0
        elif e == 28:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = np.nan
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0

        elif e == 50:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = corrscore
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore/confcount

            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore/numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = incorrscore/(confcount-numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0

    data[semester + '_QCOMPLETE'] = 0
    data[semester + '_COMPFLAG'] = 0
    data[semester +'_EFFFLAG'] = 0

    # Counts number of completed columns
    try:
        if e == 50:
            q = [15, 12, 14, 26, 27, 23, 28, 19, 3, 16, 13, 31, 32, 29, 30, 5, 6, 7, 10, 11, 20, 21, 33, 34, 35]
        elif e == 22:
            q = [15, 12, 13, 14, 26, 27, 23, 28, 19, 3, 16]
        elif e == 30:
            q = [15, 12, 13, 14, 26, 27, 23, 28, 19, 3, 16, 10, 11, 33, 34]
        elif e == 28:
            q = [6, 7, 13, 14, 16, 20, 21, 23, 27, 28, 29, 30, 31, 35]

        for v in range(h):
            # Count up totals
            total = 0
            for w in q:
                count = question + str(w)

                answered = data.loc[v, count]
                if (str(answered) == 'nan' or str(answered) == ' '):
                    continue
                else:
                    total = int(np.nansum([total, 1]))

            data.loc[v, semester + '_QCOMPLETE'] = total

            # Add completed flag
            if total == len(q):
                data.loc[v, semester + '_COMPFLAG'] = 1
            else:
                data.loc[v, semester + '_COMPFLAG'] = 0
    except KeyError:
        pass

    # Calculating effort column

    for v in range(h):
        # If there is no response for effort, mark completion as 0 for that student!
        if (pd.isnull(data.loc[v, semester + '_EFFORT'])):
            data.loc[v, semester + '_COMPFLAG'] = 0

        # If there is high effort, give full marks in flag
        if data.loc[v, semester + '_EFFORT'] == 4 or data.loc[v, semester + '_EFFORT'] == 5:
            data.loc[v, semester +'_EFFFLAG'] = 1

        # Some effort gives you only so many marks...
        elif data.loc[v, semester + '_EFFORT'] == 3:
            data.loc[v, semester +'_EFFFLAG'] = 0.5

        # NO EFFORT!! :-(
        elif data.loc[v, semester + '_EFFORT'] == 2 or data.loc[v, semester + '_EFFORT'] == 1:
            data.loc[v, semester +'_EFFFLAG'] = 0

    # Factor Analysis!
    if (semester == "PRE" and e == 30) or (semester == "PRE" and e == 22) or (semester == "PRE" and e == 28):
        # Fill out whymajs with 0 instead of NaN values so we can
        # perform FA on them
        nan_columns = [semester + "_WHYMAJ_1", semester + "_WHYMAJ_2", semester + "_WHYMAJ_3",
            semester + "_WHYMAJ_4", semester + "_WHYMAJ_5", semester + "_WHYMAJ_6",
            semester + "_WHYMAJ_7", semester + "_WHYMAJ_8", semester + "_WHYCS_1",
            semester + "_WHYCS_2", semester + "_WHYCS_3", semester + "_WHYCS_4",
            semester + "_WHYCS_5", semester + "_WHYCS_6", semester + "_WHYCS_7"
        ]
        for i in data.index:
            for column in nan_columns:
                if pd.isna(data.at[i, column]):
                    data.at[i, column] = 0

        # Factor Analysis variables
        att = [semester + '_FREQEN', semester + '_DAILYM', semester + '_DAILYG',
            semester + '_ATT_DL_3', semester + '_ATT_SC_1', semester + '_ATT_SC_2',
            semester + '_ATT_SC_4', semester + '_ATT_SC_5', semester + '_LK1',
            semester + '_LK2', semester + '_LK5', semester + '_ANX#1_1',
            semester + '_ANX#1_2', semester + '_ANX#1_3', semester + '_ANX#1_4',
            semester + '_CF_TOTAL', semester + '_ATT_DL_2', semester + '_ATT_SC_3',
            semester + "_WHYCS_1", semester + "_WHYCS_3", semester + "_WHYCS_5",
            semester + "_WHYCS_6", semester + "_EFFORT"
        ]

        # Variable selection
        att_data = data.loc[ data[semester + '_COMPFLAG']==1 ]
        att_data = att_data[att]
        # Drop all rows with NaN values
        att_data.dropna(inplace=True)

        swapList = ['_ATT_DL_2', '_ATT_DL_3', '_ATT_SC_1', '_ATT_SC_2',
            '_ATT_SC_3', '_ATT_SC_4', '_ATT_SC_5'
        ]
        for i in att_data.index:
            for col in swapList:
                swapOrdering(att_data, i, semester + col)

        # KMO and Bartlett tests
        X = att_data.copy().values
        X = check_array(X, force_all_finite='allow-nan')

        statistic, p_value = calculate_bartlett_sphericity(X)
        print("\nBarlett sphericity p={0}".format(p_value))
        kmo_per_variable, kmo_total = calculate_kmo(X)
        print("Kaiser-Meyer-Olkin measure of sampling adequacy = {0}\n".format(kmo_total))

        # Create factor analysis object and perform factor analysis
        # Using maximum likelihood analysis (ml)
        n_factors = 5
        fa = FactorAnalyzer(rotation=None, n_factors=n_factors, method="ml")
        fa.fit(att_data)

        # Kaiser normalization and oblimin rotation
        rotator = Rotator(method="oblimin", normalize=True, max_iter=25)
        loadings = rotator.fit_transform(fa.loadings_)

        # Set FA loadings to be rotator loadings
        fa.loadings_ = loadings

        # Get factor scores
        factor_scores = fa.transform(att_data)
        factor_scores = pd.DataFrame(data=factor_scores, index=att_data.index, columns=["Factor "+str(i+1) for i in range(n_factors)])
        # print("\nFactor scores: \n", factor_scores)

        factor_names = ["Numerical Self Efficacy", "School Math",
            "Academic maturity", "Numerical Relevancy", "Math Anxiety"]
        # Convert factor loadings to a df
        loadings = pd.DataFrame(data=loadings, index=att, columns=factor_names)

        # Drop non-meaningful values
        loadings = loadings.where(abs(loadings) > 0.32)
        print("Factor loadings: \n", loadings)

        scores1 = factor_scores['Factor 1'].tolist()
        plt.hist(scores1, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Numerical Self Efficacy")
        # plt.show()

        scores2 = factor_scores['Factor 2'].tolist()
        plt.hist(scores2, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("School Math")
        # plt.show()

        scores3 = factor_scores['Factor 3'].tolist()
        plt.hist(scores3, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Academic maturity")
        # plt.show()

        scores4 = factor_scores['Factor 4'].tolist()
        plt.hist(scores4, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Numerical Relevancy")
        # plt.show()

        scores5 = factor_scores['Factor 5'].tolist()
        plt.hist(scores5, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Math Anxiety")
        # plt.show()

        # Update composite variables
        for i in factor_scores.index:
            data.at[i, semester + '_SELFEFF'] = factor_scores.at[i, 'Factor 1']
            data.at[i, semester + '_SCHMATH'] = factor_scores.at[i, 'Factor 2']
            data.at[i, semester + '_ACADMAT'] = factor_scores.at[i, 'Factor 3']
            data.at[i, semester + '_MATHREL'] = factor_scores.at[i, 'Factor 4']
            data.at[i, semester + '_MATHANX'] = factor_scores.at[i, 'Factor 5']

    #data.to_csv(semester+"_scored.csv", encoding='utf-8',index=False)

    #print("Results saved to " + savedname + "_scored.csv")

    return data
Example #13
def factor_analysis(factor_df, max_feature_count=None, plot=True):
    """
    Factor analysis: extract N factors and check whether they are effective
    :param factor_df:
    :param max_feature_count:
    :param plot:
    :return:
    """
    ana_dic = {}
    max_feature_count = np.min(
        [factor_df.shape[1] // 3, 50] if max_feature_count is None else max_feature_count)
    for n_features in range(2, max_feature_count):
        logger.info(f"{n_features} 个因子时:")
        fa = FactorAnalyzer(n_factors=n_features, rotation=None)
        exception = None
        for _ in range(8, 0, -1):
            df = factor_df if _ == 0 else factor_df.sample(
                factor_df.shape[0] // (_ + 1) * _)
            try:
                fa.fit(df)
                break
            except LinAlgError as exp:
                exception = exp
                logger.exception("当前矩阵 %s 存在可逆矩阵,尝试进行 %d/(%d+1) 重新采样",
                                 df.shape, _, _)
                logger.warning(exception is None)
        else:
            logger.warning(exception is None)
            raise exception from exception

        communalities = fa.get_communalities()
        logger.info(f"\t共因子方差比(communality)({communalities.shape})")  # 公因子方差
        # logger.debug('\n%s', communalities)
        loadings = fa.loadings_
        logger.info(f"\t成分矩阵,即:因子载荷(loading)({loadings.shape})")  # 成分矩阵
        # logger.debug('\n%s', loadings)  # 成分矩阵
        var = fa.get_factor_variance()  # 给出贡献率
        # 1. Sum of squared loadings (variance)
        # 2. Proportional variance
        # 3. Cumulative variance
        logger.info(f"\tCumulative variance {var[2]}")
        kmo_per_variable, kmo_total = calculate_kmo(fa.transform(factor_df))
        if kmo_total < 0.6:
            logger.info(f'\t× -> kmo_total={kmo_total:.5f}: weak correlations between variables, not suitable for factor analysis')
        else:
            logger.info(
                f'\t√ -> kmo_total={kmo_total:.5f}: strong correlations between variables, suitable for factor analysis')
        ana_dic[n_features] = {
            "FactorAnalyzer": fa,
            # "communalities": communalities,
            # "loadings": loadings,
            # "Sum of squared loadings": var[0],
            # "Proportional variance": var[1],
            "Cumulative variance": var[2][-1],
            "KOM_Test_total": kmo_total,
        }
        if var[2][-1] > 0.95 and kmo_total > 0.6:
            break

    ana_data = pd.DataFrame(
        {k: {kk: vv for kk, vv in v.items() if kk != 'FactorAnalyzer'}
         for k, v in ana_dic.items()}).T
    if plot:
        ana_data.plot(subplots=True, figsize=(9, 6))
        plt.show()

    return ana_dic
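A minimal usage sketch for factor_analysis, assuming the module-level imports used above (pandas, numpy, FactorAnalyzer, calculate_kmo, LinAlgError and a configured logger) are available; the synthetic frame has a hidden 3-factor structure:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
latent = rng.normal(size=(300, 3))
weights = rng.normal(size=(3, 12))
factor_df = pd.DataFrame(latent @ weights + 0.5 * rng.normal(size=(300, 12)),
                         columns=['x%d' % i for i in range(12)])
ana_dic = factor_analysis(factor_df, max_feature_count=6, plot=False)
print(sorted(ana_dic.keys()))  # the factor counts that were evaluated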
Example #14
def get_structure_picea(data: Data,
                        loading_cutoff=None,
                        f_pref='F',
                        mod_pref='mod',
                        get_mod_full=False):
    """
    This function constructs the latent structure of a Picea (spruce) form
    :param data:
    :param loading_cutoff:
    :param f_pref:
    :param mod_pref:
    :param get_mod_full:
    :return:
    """

    d_factors = pd.DataFrame(data.d_phens)
    phen_names = data.phens
    phens_factors_all = []

    while len(phen_names) > 2:
        loads = get_fa_loads(d_phens=d_factors.loc[:,phen_names])
        if loads is None:
            break
        phens_factors = get_factors(loads, loading_cutoff=loading_cutoff)
        if len(phens_factors) == 0:
            break

        n_f = len(phens_factors_all)
        phens_factors_all += phens_factors

        factors = []
        fa = FactorAnalyzer(n_factors=1)
        for phens in phens_factors:
            fa.fit(d_factors.loc[:, phens])
            if len(factors) == 0:
                factors = fa.transform(d_factors.loc[:, phens])
            else:
                factors = np.concatenate((factors, fa.transform(d_factors.loc[:, phens])), axis=1)

        d_factors_tmp = pd.DataFrame(factors,
                                     columns=[f'{f_pref}{i+n_f}' for i in range(factors.shape[1])],
                                     index=d_factors.index)
        phen_names = list(d_factors_tmp.columns)
        d_factors = pd.concat([d_factors, d_factors_tmp], axis=1)

    # ---------------
    # Construct descriptions of model
    s = ' + '
    mods = dict()
    for i, tmp in reversed(list(enumerate(phens_factors_all))):
        s_f = f' {f_pref}{i}' # SPACE SYMBOL AT THE BEGINNING IS IMPORTANT
        for k, m in mods.items():
            if m.find(s_f) != -1:
                mods[k] += f'\n{f_pref}{i} =~ {s.join(tmp)}'
                s_f = ''
                break

        if s_f == '':
            continue

        mods[f'{mod_pref}{i}'] = f'{f_pref}{i} =~ {s.join(tmp)}'

    if get_mod_full:
        mod_full = dict(full='')
        for k, m in mods.items():
            mod_full['full'] += '\n' + m
        return mod_full

    return mods
Example #15
def add_snps_residuals(mod,
                       data: Data,
                       thresh_mlr=Hyperparams.thresh_mlr,
                       thresh_sign_snp=Hyperparams.thresh_sign_snp,
                       thresh_abs_param=Hyperparams.thresh_abs_param,
                       snp_pref=None,
                       n_iter=10):

    sem_mod = semopyModel(mod)
    sem_mod.fit(data.d_all)
    relations = sem_mod.inspect()
    relations = relations.loc[relations['op'] == '~', :]
    phens = [v for v in sem_mod.vars['all'] if v in data.phens]

    vars_ordered = sem_traversing(mod)
    vars_lat_ord = list(
        reversed([v for v in vars_ordered if v in sem_mod.vars['latent']]))

    new_var_names = []
    for f in vars_lat_ord:
        phens_f = relations.loc[relations['rval'] == f, 'lval']
        d = data.d_phens.loc[:, phens_f]

        fa = FactorAnalyzer(n_factors=1)
        fa.fit(d)
        f_val = fa.transform(d)
        f_val = f_val.transpose()[0]
        data.d_phens[f] = f_val
        new_var_names += [f]

    gwas = dict()
    snps_added = dict()
    # for variable in vars_lat_ord:
    for f in vars_lat_ord:
        print('-----------')
        mod_init = ''
        # print(variable)
        # print(mod_init)
        mod_fact, gwas[f], snps_added[f] = \
            add_snps_for_variable(mod_init, data, f,
                                      thresh_mlr=thresh_mlr,
                                      thresh_sign_snp=thresh_sign_snp,
                                      thresh_abs_param=thresh_abs_param,
                                  # n_iter=n_iter,
                                      snp_pref=snp_pref)

        sem_mod_f = semopyModel(mod_fact)
        relations_f = sem_mod_f.inspect()
        relations_f = relations_f.loc[relations_f['op'] == '~', :]

        f_val = 0
        for snp, snp_val in zip(relations_f['rval'], relations_f['Estimate']):
            f_val += data.d_snps[snp] * snp_val

        data.d_phens[f] = f_val

        print('-----------')

    return gwas, snps_added

    # NOTE: everything below this return is unreachable (dead code)
    print(phens)
    for p in phens:
        relations_p = relations.loc[relations['lval'] == p, :]
        p_est = 0
        for var, snp_val in zip(relations_p['rval'], relations_p['Estimate']):
            p_est += data.d_all[var] * snp_val

        p_val = d.loc[:, p]
        p_res = p_val - p_est * np.dot(p_est, p_val) / np.dot(p_est, p_est)

        p_res_name = f'residual_{p}'
        data.d_phens[p_res_name] = p_res
        new_var_names += [p_res_name]

        print('-----------')
        mod_init = ''
        mod_fact, gwas[p], snps_added[p] = \
            add_snps_for_variable(mod_init, data, p_res_name,
                                      thresh_mlr=thresh_mlr,
                                      thresh_sign_snp=thresh_sign_snp,
                                      thresh_abs_param=thresh_abs_param,
                                  # n_iter=n_iter,
                                      snp_pref=snp_pref)
        print('-----------')

    data.d_phens = data.d_phens.loc[:, [
        v for v in data.d_phens.columns if v not in new_var_names
    ]]
    return gwas, snps_added
Example #16
def calculate_py_output(test_name,
                        factors,
                        method,
                        rotation,
                        use_corr_matrix=False,
                        top_dir=None):
    """
    Use the `FactorAnalyzer()` class to perform the factor analysis
    and return a dictionary with relevant output for given scenario.

    Parameters
    ----------
    test_name : str
        The name of the test
    factors : int
        The number of factors
    method : str
        The factor extraction (fitting) method
    rotation : str
        The type of rotation
    use_corr_matrix : bool, optional
        Whether to use the correlation matrix.
        Defaults to False.
    top_dir : str, optional
        The top directory for the test data.
        Defaults to ``DATA_DIR``.

    Returns
    -------
    output : dict
        A dictionary containing the outputs
        for all `OUTPUT_TYPES`.
    """
    if top_dir is None:
        top_dir = DATA_DIR

    filename = join(top_dir, test_name + '.csv')
    data = pd.read_csv(filename)

    if use_corr_matrix:
        X = data.corr()
    else:
        X = data.copy()

    rotation = None if rotation == 'none' else rotation
    method = {'uls': 'minres'}.get(method, method)

    fa = FactorAnalyzer(n_factors=factors,
                        method=method,
                        rotation=rotation,
                        is_corr_matrix=use_corr_matrix)
    fa.fit(X)

    evalues, values = fa.get_eigenvalues()

    return {
        'value': values,
        'evalues': evalues,
        'structure': fa.structure_,
        'loading': fa.loadings_,
        'uniquenesses': fa.get_uniquenesses(),
        'communalities': fa.get_communalities(),
        'scores': fa.transform(data)
    }
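A minimal usage sketch for calculate_py_output, assuming pandas, numpy and FactorAnalyzer are imported as in the test module; a small synthetic CSV is written to a temporary directory and passed via top_dir:

import os
import tempfile

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
tmp_dir = tempfile.mkdtemp()
demo = pd.DataFrame(rng.normal(size=(100, 6)),
                    columns=['v%d' % i for i in range(6)])
demo.to_csv(os.path.join(tmp_dir, 'demo_test.csv'), index=False)

out = calculate_py_output('demo_test', factors=2, method='uls',
                          rotation='none', top_dir=tmp_dir)
print(out['loading'].shape)  # (6, 2)
print(out['scores'].shape)   # (100, 2)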
Example #17
    model = sm.OLS(y_train, X2)
    fii = model.fit()
    p_values = fii.summary2().tables[1]['P>|t|']
    print("\nModel p-values: ")
    print(p_values)


# Split data before it is transformed.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df,
                                                    data['medv'],
                                                    test_size=0.3,
                                                    random_state=1)
# Transform data with factor components.
X_train_transformed = fa.fit_transform(X_train)
X_test_transformed = fa.transform(X_test)

#----------------------------------------------------------
# Build first model
#----------------------------------------------------------
# Train regression model on training data
model = LinearRegression()
model.fit(X_train_transformed, y_train)

# Show model statistics.
showModelSummary(model, y_test, X_test_transformed)

# Check coefficient significance.
showCoefficientPValues(y_train, X_train_transformed)

Example #18
#%%
# Generate a table of task to composite score loadings
loadings = (pd.concat([loadings, eigen_values, pct_variance], axis=0).join(
    Yctrl_stats[df_].T.rename(
        index={r[0]: r[1]
               for r in zip(df_, cbs.test_names())})).loc[:, ['mean', 'std'] +
                                                          pca_names])

loadings.to_csv('./tables/Table_S3.csv')
loadings

#%% [markdown]
# ### Control Sample: Calculate Composite Cognitive Scores
#%%
# Calculates the 3 cognitive domain scores from the fitted PCA model
Zctrl[pca_names] = Ypca.transform(Zctrl[df_])

# Measure of processing speed: take the 1st Principal Component across
# timing-related features (the list of tf_)
Yspd = FactorAnalyzer(method='principal', n_factors=1,
                      rotation=None).fit(Zctrl[tf_])
Zctrl['processing_speed'] = Yspd.transform(Zctrl[tf_])

# Overall measure across CBS battery: the average of all 12 task z-scores,
# then rescale to have SD = 1.0
Zctrl['overall'] = Zctrl[df_].mean(axis=1)
Yavg_tfm = StandardScaler(with_mean=True,
                          with_std=True).fit(Zctrl[['overall']])
Zctrl['overall'] = Yavg_tfm.transform(Zctrl[['overall']])

#%% [markdown]
Example #19
    # Create factor analysis object and perform factor analysis
    # Using maximum likelihood analysis (ml)
    n_factors = 5
    fa = FactorAnalyzer(rotation=None, n_factors=n_factors, method="ml")
    fa.fit(att_data)

    # Kaiser normalization and oblimin rotation
    rotator = Rotator(method="oblimin", normalize=True, max_iter=25)
    loadings = rotator.fit_transform(fa.loadings_)

    # Set FA loadings to be rotator loadings
    fa.loadings_ = loadings
    #print (loadings)

    # Get factor scores
    factor_scores = fa.transform(att_data)
    factor_scores = pd.DataFrame(data=factor_scores, index=att_data.index, columns=["Factor "+str(i+1) for i in range(n_factors)])
    #print("\nFactor scores: \n", factor_scores)

    factor_names = ["Numerical Self Efficacy", "School Math",
        "Academic maturity", "Numerical Relevancy", "Math Anxiety"]
    # Convert factor loadings to a df
    loadings = pd.DataFrame(data=loadings, index=att, columns=factor_names)

    # Drop non-meaningful values
    loadings = loadings.where(abs(loadings) > 0.32)
    #print("Factor loadings: \n", loadings)

    scores1 = factor_scores['Factor 1'].tolist()
    plt.hist(scores1, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
    plt.title("Numerical Self Efficacy")
Example #20
def FactorAnalysis(df, rotation = "varimax", n_factors = 10, transform = False):

    """ You want "varimax" rotation if you want orthogonal (highly differentiable) with very high and low variable loading. common
        You want "oblimin" for non-orthogonal loading. Increases eigenvalues, but reduced interpretability.
        You want "promax" if you want Oblimin on large datasets.
        
        See https://stats.idre.ucla.edu/spss/output/factor-analysis/ for increased explination. 
    """   

    assert not df.isnull().values.any(), "Data must not contain any nan or inf values"
    assert all(df.std().values > 0), "Columns used in Factor Analysis must have a non-zero Std. Dev. (i.e. more than a single unique value)"

    def data_suitable(df, kmo_value = False, ignore = False):
        
        # Test to ensure the data's correlation matrix is not an identity matrix (Bartlett's test)
        chi_square_value, p_value = calculate_bartlett_sphericity(df)
        
        # test to ensure that the observed data is adequate for FA. Must be > 0.6
        kmo_all, kmo_model = calculate_kmo(df)

        if (p_value > 0.1 or kmo_model < 0.6) and ignore != True:
            raise Exception("Data is not suitable for Factor Analysis!: Identity test P value: {}.  KMO model Score: {}".format(p_value, kmo_model))
        
        if kmo_value:
            return kmo_model
        else:
            return
        
        
    print("KMO Value: {}.".format(data_suitable(df, kmo_value = True)))

    fa = FactorAnalyzer(method = "minres", 
                        rotation = rotation,
                        n_factors = n_factors)

    fa.fit(df)

    def eigenplot(df):
        df = pd.DataFrame(df)
        
        fig = go.Figure()
        
        fig.add_trace(
            go.Scatter(
                x = df.index.values,
                y = df[0].values,
                mode = 'lines'
            )
        )
        
        
        fig.add_shape(
            type = "line",
            y0 = 1,
            x0 = 0,
            y1 = 1,
            x1 = len(df),
            line = dict(
                color = 'red',
                dash = 'dash'
            )
        )
        
        fig.update_layout(
            title = "Factor Eigenvalues",
            yaxis_title="Eigenvalue",
            xaxis_title="Factor",
            xaxis = dict(
                range = [0,df[df[0] > 0].index.values[-1]]
                )
        )
        
        fig.show()
        return

    eigenplot(fa.get_eigenvalues()[1])
    Plotting.LabeledHeatmap(fa.loadings_, y = list(df.columns), title = "Factor Loading", expand = True, height = 2000, width = 2000)

    tmp = pd.DataFrame(fa.get_factor_variance()[1:]) 
    tmp.index = ["Proportional Varience","Cumulative Varience"]
    Plotting.dfTable(tmp)

    if rotation == 'promax':
        Plotting.LabeledHeatmap(fa.phi_, title = "Factor Correlation", expand = True, height = 2000, width = 2000)
        Plotting.LabeledHeatmap(fa.structure_, y = list(df.columns), title = "Variable-Factor Correlation", expand = True, height = 2000, width = 2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_communalities()).T, 
                            title = "Varience Explained",
                            x = list(df.columns), 
                            description = "The proportion of each variables varience that can be explained by the factors.", 
                            expand = True, 
                            height = 300, 
                            width = 2000)

    Plotting.LabeledHeatmap(pd.DataFrame(fa.get_uniquenesses()).T, 
                            title = "Variable Uniqueness",
                            x = list(df.columns),
                            expand = True, 
                            height = 300,
                             width = 2000)

    if transform:
        return fa.transform(df)

    return 
Example #21
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ],
],
                         columns=df_scores.columns,
                         index=['A', 'B', 'C', 'D'])

df_sample

# +
factor_scores = fa_wo_rotation.transform(df_sample)  # compute factor scores

pd.DataFrame(factor_scores,
             columns=['factor_{}'.format(i) for i in range(n_factors)],
             index=df_sample.index)
# -

# ## factor analysis with varimax rotation

fa_varimax = FactorAnalyzer(rotation='varimax', n_factors=n_factors)

# ### compute factor loadings

fa_varimax.fit(df_scores)

pd.DataFrame(fa_varimax.loadings_,
Example #22
class Factor_Analyse_select(Feature):
    """
    Factor analysis
    """
    data = None
    selected_column = list()
    method = 'minres'

    def set_data(self, data):
        """
        Pass in the data
        :param data: the data to be processed
        :return:
        """
        self.data = data
        self.full_data = self._check_missing_value()
        self.numeric_data = self.get_numeric_data()

    """
    def __init__(self, data):
        self.set_data(data)
    
    def select_column(self, *colnames):
        for colname in colnames:
            if colname not in self.data.columns.values.tolist():
                raise ValueError("所选列不存在")
            elif colname not in self.full_data:
                raise ValueError("所选列存在缺失值")
            elif colname not in self.numeric_data:
                raise ValueError("所选列不为数值")
            elif colname in self.selected_column:
                raise ValueError("this column has been selected")
            else:
                self.selected_column.append(colname)
    """

    def set_method(self, method=None):
        """
        Set the method
        :param method: the method to use, str; one of "minres", "ml", "principal"
        :return:
        """
        if method is None:
            warnings.warn("method parameter was not set")
        elif method not in ['minres', 'ml', 'principal']:
            raise ValueError("invalid method")
        else:
            self.method = method

    def fit(self):
        """
        Transform the data using the selected method
        :return: all transformed factor vectors
        """
        feed_data = self.data[self.selected_column]
        self.model = FactorAnalyzer(n_factors=feed_data.shape[1],
                                    method=self.method,
                                    rotation=None)
        self.model.fit(feed_data)
        return self.model.transform(feed_data)

    def select_by_number(self, num):
        """
        Select the num factor vectors with the largest eigenvalues
        :param num: the number of vectors to select, int
        :return: a DataFrame consisting of all input columns and the selected factor vectors (contains all data of the input table), pandas.DataFrame
        """
        if num < 0 or num > len(self.selected_column):
            raise ValueError("too many or too few columns are selected")
        temp = self.fit()
        result = pd.DataFrame(temp[:, 0:num])
        colnames = list()
        for i in range(num):
            colnames.append("FA " + str(i + 1))
        result.columns = colnames
        for i in self.data.columns.values.tolist()[::-1]:
            result.insert(0, column=i, value=self.data[[i]])
        return result

    def select_by_eig_GE_1(self):
        """
        Select the factors with eigenvalues greater than 1
        :return: a DataFrame consisting of all input columns and the selected factor vectors (contains all data of the input table), pandas.DataFrame
        """
        temp = self.fit()  # fit first so that self.model exists before its eigenvalues are read
        pre_list = self.model.get_eigenvalues()
        index = 0
        for i in pre_list[0]:
            if i < 1:
                break
            index += 1
        result = pd.DataFrame(temp[:, 0:index])
        colnames = list()
        for i in range(index):
            colnames.append("FA " + str(i + 1))
        result.columns = colnames
        for i in self.data.columns.values.tolist()[::-1]:
            result.insert(0, column=i, value=self.data[[i]])
        return result

    def _select_by(self, **type_arg):
        """
        Return the factor-analysis result according to the input arguments
        :param type_arg: dict of control parameters
        "method" in the dict: the factor-analysis method; "minres": minimum residual (default), "ml": maximum likelihood, "principal": principal components
        "type" == 0 in the dict: select results by count; typearg: select the typearg factors with the largest eigenvalues
        "type" == 1 in the dict: select all vectors with eigenvalues greater than 1
        :return: a DataFrame consisting of all input columns and the resulting vectors (contains all data of the input table), pandas.DataFrame
        """
        if "method" in type_arg.keys():
            self.set_method(type_arg["method"])
        if type_arg["type"] == 0:
            self.select_column(*type_arg["columns"])
            return self.select_by_number(type_arg["typearg"])
        elif type_arg["type"] == 1:
            self.select_column(*type_arg["columns"])
            return self.select_by_eig_GE_1()
        else:
            raise ValueError("type error:不存在所选类")