Example #1
    def get_ev(self, X):
        num_features = len(X.columns)
        fa = FactorAnalyzer(num_features, rotation=None, method=self.method)
        fa.fit(X)

        ev, v = fa.get_eigenvalues()
        return ev
Example #2
import base64
import io

import matplotlib.pyplot as plt
from factor_analyzer import FactorAnalyzer


def eigenvalues_plt(data):
    img = io.BytesIO()

    plt.switch_backend('Agg')

    plt.style.use('ggplot')
    fa = FactorAnalyzer()
    fa.fit(data)
    eigen_values, vectors = fa.get_eigenvalues()
    plt.figure(figsize=(10, 10))
    plt.scatter(range(1, data.shape[1] + 1), eigen_values)
    plt.plot(range(1, data.shape[1] + 1), eigen_values)
    plt.title('Factor Importance by Eigenvalues')

    plt.xlabel('Factors')
    plt.ylabel('Eigenvalue')
    plt.grid()

    plt.savefig(img, format='png')

    img.seek(0)
    graph_url = base64.b64encode(img.getvalue()).decode()
    plt.close()

    return 'data:image/png;base64,{}'.format(graph_url)
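Since the function returns a base64 data URI, it is typically dropped straight into an HTML img tag. A minimal usage sketch, assuming a Flask app and a hypothetical survey.csv data file:

from flask import Flask, render_template_string
import pandas as pd

app = Flask(__name__)

@app.route('/scree')
def scree():
    df = pd.read_csv('survey.csv')  # hypothetical data file
    return render_template_string('<img src="{{ graph }}">',
                                  graph=eigenvalues_plt(df))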
Example #3
import warnings

import pandas as pd
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import (calculate_bartlett_sphericity,
                                             calculate_kmo)


def get_fa_loads(d_phens,
                 kmo_threshold=0.6,
                 bartlett_threshold=0.05,
                 n_shuffle=100,
                 test_factorability=False):
    """
    Get factor loadings for a phenotype dataframe
    :param d_phens: DataFrame of phenotypes (observations x phenotypes)
    :param kmo_threshold: minimum KMO value for the data to be considered factorable
    :param bartlett_threshold: maximum p-value for Bartlett's sphericity test
    :param n_shuffle: number of shuffles used by the parallel analysis
    :param test_factorability: whether to test the factorability of the data first
    :return: DataFrame of loadings, or None if the data are not factorable
    """

    # Evaluation of the “factorability” of phenotypes
    if test_factorability:
        _, bartlett_value = calculate_bartlett_sphericity(d_phens)
        _, kmo_model = calculate_kmo(d_phens)
        if (kmo_model < kmo_threshold) or (bartlett_value > bartlett_threshold):
            # raise ValueError('Phenotypic data does not contain factors')
            warnings.warn('\nPhenotypic data does not contain factors')
            return None

    # Define the number of factors by parallel analysis
    n_factors = pa(d_phens, n_shuffle)

    # factor analysis
    fa = FactorAnalyzer(n_factors=n_factors)
    fa.fit(d_phens)

    loads = pd.DataFrame(data=fa.loadings_, index=d_phens.columns)

    return loads
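The pa helper is not part of the snippet; below is a minimal sketch of one plausible implementation (Horn's parallel analysis), where the function name and signature are assumptions carried over from the call above:

import numpy as np
from factor_analyzer import FactorAnalyzer


def pa(d_phens, n_shuffle=100):
    """Horn's parallel analysis: keep the factors whose eigenvalues exceed
    the mean eigenvalues obtained from column-shuffled copies of the data."""
    fa = FactorAnalyzer(rotation=None)
    fa.fit(d_phens)
    real_ev, _ = fa.get_eigenvalues()

    rng = np.random.default_rng(0)
    shuffled_ev = np.zeros((n_shuffle, d_phens.shape[1]))
    for i in range(n_shuffle):
        shuffled = d_phens.apply(lambda col: rng.permutation(col.values))
        fa.fit(shuffled)
        shuffled_ev[i], _ = fa.get_eigenvalues()

    return int(np.sum(real_ev > shuffled_ev.mean(axis=0)))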
Example #4
    def get_loadings(self,
                     by: CUTOFF_METHOD = "scree",
                     threshold: float = 1,
                     n_factors: Optional[int] = None,
                     is_filter: bool = False) -> pd.DataFrame:
        """
        Get PCA loading dataframe

        :param by:
            Cutoff method using Cummulative Variance Plot or Scree Plot
        :param threshold:
            Percentage of variance explained, default 80%
        :param n_factors:
            Number of factors.
        :param is_filter:
            If False will show all PCA loadings heatmap. If True, will only show attributes with cells > 0.55.
        """
        df = self.df
        _factors = self._get_factors(
            by, threshold) if n_factors is None else n_factors
        fa = FactorAnalyzer(rotation='varimax',
                            n_factors=_factors,
                            method='ml')
        fa.fit(df)
        fa_loading_matrix = pd.DataFrame(
            fa.loadings_, columns=[f'FA{i}' for i in range(1, _factors + 1)])
        return self._process_loading_matrix(fa_loading_matrix, is_filter)
Example #5
import plotly.express as px
from factor_analyzer import FactorAnalyzer


def plotfig(cols):
    c = df1.corr()  # df1 is assumed to be defined at module level; c is unused
    xa = df1[df1.columns[2:7]]
    fa = FactorAnalyzer()
    fa.fit(xa)
    # Get eigenvalues and plot them
    ev, v = fa.get_eigenvalues()
    #plt.plot(range(1,xa.shape[1]+1),ev)
    fig = px.scatter(x=list(range(1, xa.shape[1] + 1)), y=ev)
    fig.update_traces(mode='lines+markers')

    fig.update_layout(yaxis={'visible': True, 'showticklabels': True})
    fig.update_layout(xaxis={'visible': True, 'showticklabels': True})
    fig.update_layout(width=700, height=200, plot_bgcolor='rgb(255,255,255)')
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#dddddd')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='black')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='black')

    fig['layout'].update(margin=dict(l=0, r=20, b=20, t=10))
    fig.update_traces(line=dict(color="#0863ae"))
    fig.update_layout(xaxis_title="X",
                      yaxis_title="Y",
                      legend_title="Factor Analysis",
                      font=dict(family="Courier New, monospace",
                                size=12,
                                color="black"))
    return fig
Example #6
def make_loadings_matrix(rating_m):
    '''Takes a rating matrix and returns the loadings matrix. The number of factors is
    chosen at the knee of the eigenvalue curve, and a varimax rotation is applied for
    interpretability.
    '''
    # Fit the initial factor analysis
    fa = FactorAnalyzer(n_factors=10, rotation='oblimin')
    fa.fit(rating_m)
    x = list(range(1, 16))
    fa_eigens = fa.get_eigenvalues()[1]
    fa_matrix_knee = KneeLocator(x,
                                 fa_eigens,
                                 S=1.0,
                                 curve='convex',
                                 direction='decreasing')
    fa_knee = fa_matrix_knee.knee
    fa_kneed = FactorAnalyzer(n_factors=fa_knee,
                              rotation='varimax').fit(rating_m)
    loadings_m = pd.DataFrame(fa_kneed.loadings_.round(2))
    loadings_m.index = get_construct_names()
    loadings_m.index = loadings_m.index.rename(name='Construct')
    loadings_m.columns = [
        'Factor {} ({:.0f}%)'.format(
            i + 1,
            fa_kneed.get_factor_variance()[1][i] * 100)
        for i in loadings_m.columns
    ]
    return loadings_m
Example #7
    def factor_analysis(self, input_x):
        ss_x = StandardScaler().fit_transform(input_x)
        norm_x = normalize(input_x, axis=0)  # computed but unused below
        factor_number = 9
        fa = FactorAnalyzer(
            n_factors=factor_number,
            rotation='oblimin')  # oblique: oblimin/promax; orthogonal: varimax
        fa.fit(ss_x)
        ev, v = fa.get_eigenvalues()
        factor_loading_matrix = fa.loadings_
        fa_score = fa.transform(ss_x)
        print('ev', ev)
        # print('v',v)
        # print('factor_loading_matrix',factor_loading_matrix)
        fa_name = list(self.table_data.columns[1::])
        # print('quantization_score', len(fa_name),fa_name)
        for i in range(factor_number):
            all_coefficients = np.sort(factor_loading_matrix[:, i])
            coefficients_index = np.argsort(factor_loading_matrix[:, i])
            print('factor_i', i)
            for j, coefficient in enumerate(all_coefficients):
                if coefficient > 0.5:
                    print('coefficients_index', coefficients_index[j],
                          fa_name[coefficients_index[j]])

        plt.scatter(range(1, input_x.shape[1] + 1), ev)
        plt.plot(range(1, input_x.shape[1] + 1), ev)
        plt.title('scree figure')
        plt.ylabel('eigenvalues')
        plt.grid()
        plt.show()

        return fa_score
Example #8
import numpy as np
from factor_analyzer import FactorAnalyzer
from scipy import stats


def numbFactorsTest(X, m=1, met='ml', alfa=0.05):  # met can also be 'principal' or 'minres'
    n, p = X.shape
    R = np.corrcoef(np.transpose(X))
    p_val = 0

    fa = FactorAnalyzer(method=met,
                        rotation='varimax',
                        n_factors=m,
                        is_corr_matrix=False)
    fa.fit(X)
    L = fa.loadings_
    ll = L @ L.T
    fi = np.diag(R) - np.diag(ll)
    Sg = ll + np.diag(fi)

    # Ledermann bound on the number of factors
    l_max = 1 / 2 * (2 * p + 1 - (8 * p + 1)**0.5)
    if m < l_max:
        df = (((p - m)**2) - (p + m)) * 1 / 2
        vt = (n - 1 - (2 * p + 4 * m + 5) / 6) * np.log(
            np.linalg.det(Sg) / np.linalg.det(R))
        vc = stats.chi2.ppf(1 - alfa, df)
        p_val = stats.chi2.sf(vt, df)  # p-value (upper tail of the chi-square)
        if vt > vc:  # reject H0
            H0 = False
        else:
            H0 = True
    else:
        H0 = False

    cumVar = fa.get_factor_variance()[2][-1]
    return (H0, p_val, cumVar)
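A hedged usage sketch for the test above, where X is assumed to be an (n, p) array of observations: increase m until H0 ("m factors are sufficient") is retained.

for m in range(1, 6):
    H0, p_val, cumVar = numbFactorsTest(X, m=m)
    if H0:
        print(f'{m} factors are sufficient '
              f'(p = {p_val:.3f}, cumulative variance = {cumVar:.2f})')
        break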


Example #9
def factors_lst(number_factors, lst_obs, prnt):
    """
    Does the Factor anlaysing/dimensionality reduction for a list of observations with a loop ober that list
    :param number_factors:  the number of factors to be taken (the reduced dimensionality), has to be the same for
                            all the list elements (integer)
    :param lst_obs:         list,  elements hold the dF/F for the Mouse/sessions/trial type of one specific trial
    :return:                list, elements hold the transformed observations. the shape of the elements is now
                            (time steps, number of factors)
    """

    lst_obs_transformed = []

    for item in lst_obs:

        fa = FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
                            method='minres', n_factors=number_factors, rotation=None, rotation_kwargs={},
                            use_smc=True)

        fa.fit(item)
        obs_transformed = fa.transform(item)

        lst_obs_transformed.append(obs_transformed)

    if prnt:

        print()
        print('shape of one element of the list')
        print('after the dim reduction:                  ', np.shape(lst_obs_transformed[0]))
        print('number of time steps:                     ', lst_obs_transformed[0].shape[0])
        print('number of dimensions:                     ', lst_obs_transformed[0].shape[1])

    return lst_obs_transformed
Example #10
def find_number_of_Factors_1(eigenval_limit, dimensions, obs,  kind, prnt):

    """this function calculates the number of factors with an Eigenvalue which is greater then the 'eigenval_limit,
    without the param trial_index
        :param   eigenval_limit: number (float) , recommended = 1.0
                 dimensions:     dimensions before dimensionality reduction (obs.shape[1])
                 obs:            2 dim array holding the averaged data
                 kind:           0, if data is averaged, 1 if data is single trial, 2 if data is concatenated
        :return: the number of factors generating the the data with eigenvalues greater then eigenval limit
                  """

    fa = FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
                        method='minres', n_factors=dimensions, rotation=None, rotation_kwargs={},
                        use_smc=True)

    fa.fit(obs)
    eigenvals, x = fa.get_eigenvalues()

    # take the eigenvals >= 1 --> number of them = number of relevant factors
    num_FA_dim = len(eigenvals[eigenvals >= eigenval_limit])

    if prnt:

        if kind == 0:
            print('averaged:')
            print('Number of Factors:                           ', num_FA_dim)


        elif kind == 2:
            print('concatenated:')
            print('Number of Factors:                           ', num_FA_dim)

    return num_FA_dim
Example #11
def get_MFA_params(zl, kl, rl_nextl):
    ''' Determine clusters with a GMM and then adjust a Factor Model over each cluster
    zl (ndarray): The lth layer latent variable 
    kl (int): The number of components of the lth layer
    rl_nextl (1darray): The dimension of the lth layer and (l+1)th layer
    -----------------------------------------------------
    returns (dict): Dict with the parameters of the MFA approximated by GMM + FA. 
    '''
    #======================================================
    # Fit a GMM in the continuous space
    #======================================================
    numobs = zl.shape[0]

    not_all_groups = True
    max_trials = 100
    empty_count_counter = 0

    while not_all_groups:
        # If there are not enough observations per group, the MFA diverges...

        gmm = GaussianMixture(n_components=kl)
        s = gmm.fit_predict(zl)

        clusters_found, count = np.unique(s, return_counts=True)

        if (len(clusters_found) == kl):  # & (count >= 5).all():
            not_all_groups = False

        empty_count_counter += 1
        if empty_count_counter >= max_trials:
            raise RuntimeError('Could not find a GMM init that presents the '
                               'proper number of groups:', kl)

    psi = np.full((kl, rl_nextl[0], rl_nextl[0]), 0).astype(float)
    psi_inv = np.full((kl, rl_nextl[0], rl_nextl[0]), 0).astype(float)
    H = np.full((kl, rl_nextl[0], rl_nextl[1]), 0).astype(float)
    eta = np.full((kl, rl_nextl[0]), 0).astype(float)
    z_nextl = np.full((numobs, rl_nextl[1]), np.nan).astype(float)

    #========================================================
    # ... and then fit an FA on each of those groups
    #========================================================

    for j in range(kl):
        indices = (s == j)
        fa = FactorAnalyzer(rotation=None, method='ml', n_factors=rl_nextl[1])
        fa.fit(zl[indices])

        psi[j] = np.diag(fa.get_uniquenesses())
        H[j] = fa.loadings_
        psi_inv[j] = np.diag(1 / fa.get_uniquenesses())
        z_nextl[indices] = fa.transform(zl[indices])

        eta[j] = np.mean(zl[indices], axis=0)

    params = {'H': H, 'psi': psi, 'z_nextl': z_nextl, 'eta': eta, 'classes': s}
    return params
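A minimal usage sketch with synthetic data (the sizes are hypothetical): a 6-dimensional latent sample split into kl=3 clusters, with a 2-factor model fitted per cluster.

import numpy as np

zl = np.random.default_rng(0).normal(size=(500, 6))
params = get_MFA_params(zl, kl=3, rl_nextl=[6, 2])
print(params['H'].shape)        # (3, 6, 2): one loadings matrix per cluster
print(params['z_nextl'].shape)  # (500, 2): next-layer latent variables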
Example #12
    def _get_variance_info(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
        """
        Return a Tuple consisting of 3 arrays:
        1. Sum of squared loadings (variance)
        2. Proportional variance
        3. Cumulative variance
        """
        fa = FactorAnalyzer(rotation=None)
        fa.fit(self.df.dropna())
        return fa.get_factor_variance()
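The three arrays line up with the three rows returned by factor_analyzer's get_factor_variance(); a short usage sketch with a hypothetical report instance of the surrounding class:

variance, prop_var, cum_var = report._get_variance_info()
print(cum_var[-1])  # cumulative proportion of variance explained by all factors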
Example #13
    def FA(self):
        fa = FactorAnalyzer(n_factors=1, method="principal", rotation="varimax")
        fa.fit(self.df)
        # Print eigenvalues
        ev, v = fa.get_eigenvalues()
        print(ev)
        # Print loadings
        print(fa.loadings_)
        self.coeff = fa.loadings_
        return 0
Example #14
File: lb_v2.py  Project: ikem55/HRsystem
    def factory_analyze_raceuma_result_df(self, race_df, input_raceuma_df,
                                          dict_folder):
        """ RaceUmaの因子分析を行うためのデータを取得 """
        print("factory_analyze_raceuma_result_df")
        temp_df = pd.merge(input_raceuma_df, race_df, on="競走コード")
        X = temp_df[[
            '競走コード', '馬番', '枠番', 'タイム指数', '単勝オッズ', '先行率', 'ペース偏差値', '距離増減',
            '斤量比', '追込率', '平均タイム', "距離", "頭数", "非根幹", "上り係数", "逃げ勝ち", "内勝ち",
            "外勝ち", "短縮勝ち", "延長勝ち", "人気勝ち", "1番人気", "3角先頭", "4角先頭", "上がり最速",
            "上がりタイム", "連闘", "休み明け", "大差負け", "展開脚質", "展開脚色"
        ]]

        mmsc_columns = ["頭数", "展開脚質", "展開脚色", "上がりタイム"]
        mmsc_dict_name = "sc_fa_race_mmsc"
        stdsc_columns = ["距離"]
        stdsc_dict_name = "sc_fa_race_stdsc"
        X = mu.scale_df_for_fa(X, mmsc_columns, mmsc_dict_name, stdsc_columns,
                               stdsc_dict_name, dict_folder)

        X_fact = X.drop(["競走コード", "馬番"], axis=1).astype({
            '非根幹': int,
            '逃げ勝ち': int,
            '内勝ち': int,
            '外勝ち': int,
            '短縮勝ち': int,
            '延長勝ち': int,
            '人気勝ち': int,
            '1番人気': int,
            '3角先頭': int,
            '4角先頭': int,
            '上がり最速': int,
            '休み明け': int,
            '連闘': int,
            '大差負け': int
        })

        X_fact = X_fact.replace(np.inf,
                                np.nan).fillna(X_fact.median()).fillna(0)
        X_fact.iloc[0] = X_fact.iloc[0] + 0.000001

        dict_name = "fa_raceuma_result_df"
        filename = dict_folder + dict_name + '.pkl'
        if os.path.exists(filename):
            fa = mu.load_dict(dict_name, dict_folder)
        else:
            fa = FactorAnalyzer(n_factors=5, rotation='promax', impute='drop')
            fa.fit(X_fact)
            mu.save_dict(fa, dict_name, dict_folder)

        fa_np = fa.transform(X_fact)
        fa_df = pd.DataFrame(fa_np,
                             columns=["fa_1", "fa_2", "fa_3", "fa_4", "fa_5"])
        fa_df = pd.concat([X[["競走コード", "馬番"]], fa_df], axis=1)
        X_fact = pd.merge(input_raceuma_df, fa_df, on=["競走コード", "馬番"])
        return X_fact
Example #15
    def factor_analysis_perdomain(self, write=False):
        '''
        Function to perform factor analysis per domain
        Args: 
            write: Boolean variable to choose whether to write the output to a file or not. 
        Returns:
            None
        '''
        # load domains into domain data
        domain_data = self.load_domains()

        # Get mapping of original numbers to shuffled numbers
        true_num = list(pd.read_csv(self.data, index_col=0))
        for d in self.domains:
            curr = domain_data[d]
            col = list(curr)
            ind = []
            for c in col:
                ind.append(true_num.index(c))
            index_num = list(map(int, ind))
            column_name = list(np.array(self.extract_features())[index_num])

            inp = domain_data[d]
            fa = FactorAnalyzer(n_factors=self.n, rotation='varimax')

            # In some cases, factor analysis does not succeed
            try:
                fa.fit(inp)
            except Exception:
                print(
                    'Data from ' + str(d) +
                    ' domain cannot be factorized as it results in a singular matrix.'
                )
                continue
            magnitude = fa.get_communalities()

            mag_dict = {}
            for i, _ in enumerate(column_name):
                mag_dict[column_name[i]] = magnitude[i]

            sorted_mag = sorted(mag_dict.items(),
                                key=lambda kv: kv[1],
                                reverse=True)

            if write:
                factors = pd.DataFrame(sorted_mag,
                                       columns=['Feature', 'Importance'])
                if self.DC:
                    factors.to_csv('output/fa_decorrelated_' + str(d) + '_' +
                                   str(self.VER) + '.csv',
                                   index=False)
                else:
                    factors.to_csv('output/fa_' + str(d) + '_' +
                                   str(self.VER) + '.csv',
                                   index=False)
Example #16
def get_s(e_square):

    fa = FactorAnalyzer(n_factors=1, rotation="varimax")
    fa.fit(e_square)
    loadings = fa.loadings_

    tmp = (loadings - loadings.min()) / (loadings.max() - loadings.min())
    s = tmp / np.sum(tmp**2)
    s_prime = s / np.linalg.norm(s)

    return s_prime
Example #17
def get_fa(input_: Array,
           learn_input: Array,
           learn_weight_vec: Opt[Array],
           n_comp_list: Iterable[int],
           err_printer: Callable[[Array, Array, str], None] = None,
           normalize_x: bool = True,
           normalize_z: bool = False) -> LinearAnalyzer:
    """ The last from ``n_comp_list`` would be returned. """

    n_comp_list = list(n_comp_list)

    x = x_normalized = learn_input  # (~6000, ~162)
    weight_vec = learn_weight_vec
    μ_x: Union[Array, int] = 0
    σ_x: Union[Array, int] = 1
    if normalize_x:
        x_normalized, μ_x, σ_x = get_x_normalized_μ_σ(x, weight_vec)
    Σ_x = np.cov(x_normalized.T, aweights=weight_vec)  # (~162, ~162)

    for j, i in enumerate(n_comp_list):
        fa = FactorAnalyzer(n_factors=i, is_corr_matrix=True, rotation=None)
        fa.fit(Σ_x)
        fa.mean_ = np.zeros(x.shape[1])
        fa.std_ = fa.mean_ + 1.
        z = fa.transform(x_normalized)  # same as:
        # from numpy.linalg import inv
        # (~6000, ~9) = (~6000, ~162) @ ((~162, ~162) @ (~162, ~9))
        # z = ((x_normalized - 0) / 1) @ (inv(Σ_x) @ fa.structure_)

        inverse_transform_matrix, μ_z, σ_z = get__inverse_transform_matrix__μ_z__σ_z(
            z, weight_vec, normalize_z, x_normalized)

        an = LinearAnalyzer(n=fa.n_factors,
                            analyzer=fa,
                            x=input_,
                            μ_x=μ_x,
                            σ_x=σ_x,
                            μ_z=μ_z,
                            σ_z=σ_z,
                            inverse_transform_matrix=inverse_transform_matrix,
                            normalize_x=normalize_x,
                            normalize_z=normalize_z)

        if err_printer is not None:
            pref = f"Factors N = {fa.n_factors}, "
            err_printer(input_, an.x_rec, pref)

        if (j + 1) == len(n_comp_list):
            break
    else:
        raise ValueError('Empty n_comp_list')
    return an
Example #18
import matplotlib.pyplot as plt
from factor_analyzer import FactorAnalyzer


def fn_biplot_rev(data, col_ind_1, col_ind_2, xlim_lb, xlim_ub, ylim_lb, ylim_ub, labels=None):
    # function
    datavalues = data.copy()
    #     datavalues = data.values

    col_1 = col_ind_1 - 1
    col_2 = col_ind_2 - 1

    xs = datavalues[:, col_1]
    ys = datavalues[:, col_2]
    n = datavalues.shape[1]
    xs_count = len(datavalues)

    scalex = 1.0 / (xs.max() - xs.min())
    scaley = 1.0 / (ys.max() - ys.min())
    plt.scatter(xs, ys, color='r')

    ts = []
    for i in range(xs_count):
        ts.append(plt.text(xs[i], ys[i], 'Q' + str(i + 1)))

    plt.xlim(xlim_lb, xlim_ub)
    plt.ylim(ylim_lb, ylim_ub)

    plt.xlabel("FA{}".format(col_ind_1))
    plt.ylabel("FA{}".format(col_ind_2))
    plt.grid()


# plot (this usage code belongs at module level; ML_result_2_fa and data_df
# are assumed to be defined earlier in the script)
fn_biplot_rev(ML_result_2_fa, col_ind_1=1, col_ind_2=2, xlim_lb=-1, xlim_ub=1, ylim_lb=-1, ylim_ub=1)
plt.show()

# orthogonal rotation: setting rotation to 'varimax' gives an orthogonal rotation
fa = FactorAnalyzer(n_factors=2, rotation='varimax', method='ml')
fa.fit(data_df)
ML_varimax_2_fa = fa.loadings_
print(ML_varimax_2_fa)

# plot
ML_varimax_2_fa_mat = ML_varimax_2_fa
fn_biplot_rev(ML_varimax_2_fa_mat, col_ind_1=1, col_ind_2=2, xlim_lb=-1, xlim_ub=1, ylim_lb=-1, ylim_ub=1)
plt.show()

# oblique rotation: promax
fa = FactorAnalyzer(n_factors=2, rotation='promax', method='ml')
fa.fit(data_df)
ML_promax_2_fa = fa.loadings_
print(ML_promax_2_fa)

# plot
fn_biplot_rev(ML_promax_2_fa, col_ind_1=1, col_ind_2=2, xlim_lb=-1, xlim_ub=1, ylim_lb=-1, ylim_ub=1)
plt.show()
Example #19
def best_num_factors(df):
    fa = FactorAnalyzer(bounds=(0.005, 1),
                        impute='median',
                        is_corr_matrix=False,
                        method='minres',
                        n_factors=10,
                        rotation=None,
                        rotation_kwargs={},
                        use_smc=True)
    fa.fit(df)
    ev, v = fa.get_eigenvalues()
    num_f = len([e for e in ev if e > ev.mean() + 2 * ev.std()])
    return num_f
Example #20
    def FactorAnalyze(self, rotate="varimax"):
        self.SharedVariance = copy.deepcopy(self.NormSubThresh)
        self.SharedVariance = self.SharedVariance.iloc[:1]
        self.SharedVariance.index.rename("Shared Variance", inplace=True)
        self.FactorLoadings = copy.deepcopy(self.NormSubThresh)
        self.FactorLoadings = self.FactorLoadings.iloc[:3]
        self.FactorLoadings.index.rename("Factor Number", inplace=True)
        for key in self.SharedVariance.columns.unique(level=0):
            factor = FactorAnalyzer(n_factors=3, rotation=rotate)
            factor.fit(sklearn.preprocessing.StandardScaler().fit_transform(
                self.NormSubThresh[key].values))
            self.SharedVariance[key] = np.atleast_2d(
                factor.get_communalities())
            self.FactorLoadings[key] = factor.loadings_.T
Example #21
def show_num_factors(df):
    fa = FactorAnalyzer(bounds=(0.005, 1),
                        impute='median',
                        is_corr_matrix=False,
                        method='minres',
                        n_factors=10,
                        rotation='varimax',
                        rotation_kwargs={},
                        use_smc=True)
    fa.fit(df)
    ev, v = fa.get_eigenvalues()
    num_f = len([e for e in ev if e > ev.mean() + 2 * ev.std()])
    res_f = len([e for e in ev if e > 1])
    return f"Best number of factors: {num_f}. Other possible factors {res_f-num_f}"
Example #22
def fac_an(df, n_factors, name):
    drop_nn(df)
    fa = FactorAnalyzer(bounds=(0.005, 1),
                        impute='median',
                        is_corr_matrix=False,
                        method='minres',
                        n_factors=n_factors,
                        rotation='varimax',
                        rotation_kwargs={},
                        use_smc=True)
    fa.fit(df)
    load = pd.DataFrame.from_records(
        fa.loadings_,
        columns=[f'{name}{i + 1}' for i in range(n_factors)])
    load['item#'] = df.columns
    return load
Example #23
    def factor_analysis(self, *x_columns: str, n_factor: int = None) -> dict:
        """Factor analysis

        :param x_columns: names of the columns holding the x variables
        :param n_factor: number of common factors (may be set manually; defaults to automatic)
        :return: dict containing the communalities, the component matrix, and the total variance explained
        """
        X_data = pd.DataFrame(self.data, columns=list(x_columns))
        if n_factor is not None:
            fa = FactorAnalyzer(method="principal", n_factors=n_factor)
        else:
            fa = FactorAnalyzer(method="principal")
        fa.fit(X_data)
        result_dict = dict()
        result_dict['communalities'] = fa.get_communalities().tolist()
        result_dict['component_matrix'] = fa.loadings_.tolist()
        result_dict['factor_variance'] = [arr.tolist() for arr in fa.get_factor_variance()]
        return result_dict
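A hedged usage sketch, assuming analyzer is an instance of the surrounding class whose data attribute holds survey responses with columns q1..q4 (all names hypothetical):

result = analyzer.factor_analysis('q1', 'q2', 'q3', 'q4', n_factor=2)
print(result['communalities'])       # shared variance per variable
print(result['factor_variance'][2])  # cumulative variance explained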
Example #24
class CompositeFATransformer(BaseEstimator, TransformerMixin):
    """
    This class takes a DataFrame and performs n-factor analysis, producing a weighted
    composite score as well as n factors.

    Attributes
    ---
    num_factors (int): The number of factors to be used for factor analysis
    rotation (str): The rotation to be used by the FactorAnalyzer
    method (str): The method to be used by the FactorAnalyzer
    """
    def __init__(self, num_factors, rotation='varimax', method='principal'):
        self.num_factors = num_factors
        self.rotation = rotation
        self.method = method

    def get_ev(self, X):
        num_features = len(X.columns)
        fa = FactorAnalyzer(num_features, rotation=None, method=self.method)
        fa.fit(X)

        ev, v = fa.get_eigenvalues()
        return ev

    def fit(self, X, y=None):
        ev = self.get_ev(X)
        self.weighted_ev = ev[:self.num_factors] / sum(ev[:self.num_factors])

        self.fa = FactorAnalyzer(n_factors=self.num_factors,
                                 rotation=self.rotation,
                                 method=self.method)
        self.fa.fit(X)

        return self

    def transform(self, X):
        lf = pd.DataFrame(self.fa.transform(X))
        lf.columns = ['factor_%i' % (int(x) + 1) for x in lf.columns]
        lf['composite_score'] = lf.apply(
            lambda x: np.dot(self.weighted_ev, np.array(x)), axis=1)
        lf.index = X.index
        return lf
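Because the class implements the scikit-learn transformer protocol, it drops into a Pipeline; a minimal sketch, assuming df is a numeric DataFrame defined elsewhere:

from sklearn.pipeline import Pipeline

pipe = Pipeline([('fa', CompositeFATransformer(num_factors=3))])
scores = pipe.fit_transform(df)  # columns: factor_1..factor_3, composite_score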
Example #25
File: cityData.py  Project: pyonSec/city
    def FA(self):
        print(self.df.columns)
        print(self.mean)
        print(self.sigma)
        '''
        print(self.df)
        chi_square_value,p_value=calculate_bartlett_sphericity(self.df)
        print(chi_square_value, p_value)
        # Bartlett ’s test, the p-value is 0. The test was statistically significant, indicating that the observed correlation matrix is not an identity matrix.
        kmo_all,kmo_model=calculate_kmo(self.df)
        print(kmo_model)
        # Kaiser-Meyer-Olkin (KMO) Test measures the suitability of data for factor analysis.
        fa = FactorAnalyzer(n_factors=self.df.shape[1], rotation=None)
        fa.fit(self.df)
        # Check Eigenvalues
        ev, v = fa.get_eigenvalues()
        print(ev)
        '''
        fa = FactorAnalyzer(n_factors=1,
                            method="principal",
                            rotation="varimax")
        fa.fit(self.df)
        # Print eigenvalues
        ev, v = fa.get_eigenvalues()
        print(ev)

        # Print loadings
        print(fa.loadings_)
        #print(fa.transform(self.df))
        '''
        plt.scatter(range(1,self.df.shape[1]+1),ev)
        plt.plot(range(1,self.df.shape[1]+1) ,ev)
        plt.title('Scree Plot')
        plt.xlabel('Factors')
        plt.ylabel('Eigenvalue')
        plt.grid()
        plt.savefig('{}.png'.format(self.indicatorName))
        plt.close()
        '''
        return 0
Example #26
def factors(num_FA_dim, obs, kind, prnt):
    """
    Does the Factor anlaysing/dimensionality reduction
    :param num_FA_dim:  the number of factors generating the the data with eigenvalues greater then eigenval limit
                        (integer)
    :param obs:         data to be generated by the factors (2d np.array)
    :param: kind:       0,1 or 2 depending on the data: averaged data: kind = 0, single Trial: kind = 1
                        concatenated data: kind = 2
    :return:            the factors generating the data with less dimensions
                        (2d np.array, shape:(time steps, num_FA_dim)
    """

    fa = FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
                        method='minres', n_factors=num_FA_dim, rotation=None, rotation_kwargs={},
                        use_smc=True)

    fa.fit(obs)
    obs_transformed_FA = fa.transform(obs)

    if prnt:

        if kind == 0:
            print('shape of the "obs_transformed_FA" array:     ', np.shape(obs_transformed_FA))
            print('number of time steps:                        ', obs_transformed_FA.shape[0])
            print('number of dimensions:                        ', obs_transformed_FA.shape[1])
            print()

        elif kind == 1:
            print('shape of the "obs_transformed_FA_s" array:   ', np.shape(obs_transformed_FA))
            print('number of time steps:                        ', obs_transformed_FA.shape[0])
            print('number of dimensions:                        ', obs_transformed_FA.shape[1])
            print()

        elif kind == 2:
            print('shape of the "obs_transformed_FA_c" array:   ', np.shape(obs_transformed_FA))
            print('number of time steps:                        ', obs_transformed_FA.shape[0])
            print('number of dimensions:                        ', obs_transformed_FA.shape[1])
            print()

    return obs_transformed_FA
Example #27
    def factory_analyze_raceuma_result_df(self, race_df, input_raceuma_df,
                                          dict_folder):
        """ RaceUmaの因子分析を行うためのデータを取得 """
        print("-- check! this is BaseTransform class: " +
              sys._getframe().f_code.co_name)
        temp_df = pd.merge(input_raceuma_df, race_df, on="競走コード")
        X = temp_df[[
            '競走コード', '馬番', 'タイム指数', '単勝オッズ', '先行率', 'ペース偏差値', '距離増減', '斤量比',
            '追込率', '平均タイム', "距離", "頭数", "非根幹", "上り係数", "逃げ勝ち", "内勝ち", "外勝ち",
            "短縮勝ち", "延長勝ち", "人気勝ち"
        ]]

        mmsc_columns = ["頭数"]
        mmsc_dict_name = "sc_fa_race_mmsc"
        stdsc_columns = ["距離"]
        stdsc_dict_name = "sc_fa_race_stdsc"
        X = mu.scale_df_for_fa(X, mmsc_columns, mmsc_dict_name, stdsc_columns,
                               stdsc_dict_name, dict_folder)

        X_fact = X.drop(["競走コード", "馬番"], axis=1)

        X_fact = X_fact.replace(np.inf,
                                np.nan).fillna(X_fact.median()).fillna(0)
        X_fact.iloc[0] = X_fact.iloc[0] + 0.000001

        dict_name = "fa_raceuma_result_df"
        filename = dict_folder + dict_name + '.pkl'
        if os.path.exists(filename):
            fa = mu.load_dict(dict_name, dict_folder)
        else:
            fa = FactorAnalyzer(n_factors=3, rotation='promax', impute='drop')
            fa.fit(X_fact)
            mu.save_dict(fa, dict_name, dict_folder)

        fa_np = fa.transform(X_fact)
        fa_df = pd.DataFrame(fa_np, columns=["fa_1", "fa_2", "fa_3"])
        fa_df = pd.concat([X[["競走コード", "馬番"]], fa_df], axis=1)
        X_fact = pd.merge(input_raceuma_df, fa_df, on=["競走コード", "馬番"])
        return X_fact
Example #28
    def fit(self, n_factors=3, rotation='varimax'):
        '''
        Parameters
        ----------
        n_factors : int, optional, (default:3)
        \t The number of factors to select
        
        rotation : str, optional, (default:'varimax')
        \t The type of rotation to perform after fitting 
        \t the factor analysis model
        
        \t Rotation Methods
        \t (a) varimax : orthogonal rotation
        \t (b) promax  : oblique rotation
        \t (c) oblimin : oblique rotation
        \t (d) oblimax : orthogonal rotation
        \t (e) quartimin : oblique rotation
        \t (f) quartimax : orthogonal rotation
        \t (g) equamax : orthogonal rotation
        
        Returns
        -------
        self.variance : array of floats
        \t The factor variance information,
        \t including variance, proportional variance and
        \t cumulative variance for each factor

        self.loadings_ : array of floats,
        of shape (n_factors, n_factors)
        \t The factor loadings matrix
        '''
        self.n_factors, self.rotation = n_factors, rotation
        kwargs = dict(n_factors=n_factors,
                      rotation=rotation,
                      is_corr_matrix=True)
        fa = FactorAnalyzer(**kwargs)
        fa.fit(self.loadings)
        self.variance = fa.get_factor_variance()
        self.loadings_ = fa.loadings_
Example #29
def factor_analyzer(n_factors):
    fa = FactorAnalyzer(n_factors=n_factors, rotation=None)
    fa_fit_out = fa.fit(cars_ar)
    fa_communalities = fa_fit_out.get_communalities()
    fa_gof = sum(fa_communalities)
    fa_scores = fa_fit_out.transform(cars_ar)
    fa_factor_loadings = fa_fit_out.loadings_
    return {
        'fa_gof': fa_gof,
        'fa_communalities': fa_communalities,
        'fa_scores': fa_scores,
        'fa_factor_loadings': fa_factor_loadings
    }
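A hedged usage sketch for the function above, where cars_ar is assumed to be a module-level (n_samples, n_features) numeric array:

out = factor_analyzer(n_factors=3)
print(out['fa_gof'])           # sum of communalities, a rough goodness-of-fit
print(out['fa_scores'].shape)  # (n_samples, 3)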
Example #30
    def factor_analysis(self, write=False):
        '''
        Function to get the features and their importance in factor analysis
        Args: 
            write: Boolean variable to choose whether to write the output to a file or not. 
        Returns: 
            inp: pandas DataFrame that contains the original data 
            sorted_mag: list of tuples storing the features and their importance in decreasing order
        '''
        inp = pd.read_csv(self.data, index_col=0)

        # fa stores the output of the factor_analyzer
        fa = FactorAnalyzer(n_factors=self.n, rotation='varimax')

        # fits the input to get feature importances
        fa.fit(inp)

        # gets the communalities (shared variance) for each feature
        magnitude = fa.get_communalities()

        feat = self.extract_features()

        # Dictionary to hold the correct feature name to the number
        mag_dict = {}
        for t, f in enumerate(feat):
            mag_dict[f] = magnitude[t]

        sorted_mag = sorted(mag_dict.items(),
                            key=lambda kv: kv[1],
                            reverse=True)

        # Writes the output of factor analysis to a file
        if write:
            factors = pd.DataFrame(sorted_mag,
                                   columns=['Feature', 'Importance'])
            factors.to_csv(self.fa_file, index=False)

        return inp, sorted_mag