def the_fa(all_measures_, n_factors=-1):
    fa_ = FactorAnalyzer()
    if n_factors < 0:
        # default: extract as many factors as there are columns
        fa_.analyze(all_measures_, len(all_measures_.columns), rotation='promax')
    else:
        fa_.analyze(all_measures_, n_factors, rotation='promax')
    return fa_
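# Usage sketch (hedged): assumes the pre-0.3.2 factor_analyzer API
# (fa.analyze) that these examples target; the demo DataFrame is made up.
import numpy as np
import pandas as pd

demo = pd.DataFrame(np.random.randn(200, 6),
                    columns=['m%d' % i for i in range(6)])
fa = the_fa(demo)   # n_factors < 0: one factor per column
print(fa.loadings)  # promax-rotated loadings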
Example #2
def calculate_py_output(test_name,
                        factors,
                        method,
                        rotation,
                        top_dir=None):
    """
    Use the `FactorAnalyzer()` class to perform the factor analysis
    and return a dictionary with relevant output for given scenario.

    Parameters
    ----------
    test_name : str
        The name of the test
    factors : int
        The number of factors
    method : str
        The factor extraction method (e.g. 'minres', 'ml', 'principal')
    rotation : str
        The type of rotation
    top_dir : str, optional
        The top directory for test data
        Defaults to `DATA_DIR`

    Returns
    -------
    output : dict
        A dictionary containing the outputs
        for all `OUTPUT_TYPES`.
    """
    if top_dir is None:
        top_dir = DATA_DIR

    filename = join(top_dir, test_name + '.csv')
    data = pd.read_csv(filename)

    rotation = None if rotation == 'none' else rotation
    method = {'uls': 'minres'}.get(method, method)  # treat 'uls' as an alias for 'minres'

    fa = FactorAnalyzer()
    fa.analyze(data, factors, method=method, rotation=rotation)

    evalues, values = fa.get_eigenvalues()

    return {'value': values,
            'evalues': evalues,
            'structure': fa.structure,
            'loading': fa.loadings,
            'uniquenesses': fa.get_uniqueness(),
            'communalities': fa.get_communalities(),
            'scores': fa.get_scores(data)}
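# Usage sketch (hedged): 'test01' is a hypothetical test name; it resolves to
# DATA_DIR/test01.csv inside the function, and 'uls' is mapped to 'minres'.
out = calculate_py_output('test01', factors=3, method='uls', rotation='promax')
print(out['loading'].round(3))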
Example #3
    def run(self, dfx, n_factors=3):

        self.n_factors = n_factors

        msg = {}

        x_numer_cols, x_cate_cols = ParseDFtypes(dfx)

        if x_numer_cols == []:
            logging.error(
                'Input dfx has no numeric columns. Please check your input dfx data!'
            )
            msg['error'] = 'Input dfx has no numeric columns. Please check your input dfx data!'
            return {'result': pd.DataFrame(), 'msg': msg}

        else:

            if x_cate_cols != []:
                logging.warning(
                    'Input dfx has non-numeric columns: %s; these columns will be ignored!'
                    % x_cate_cols)

                msg['warning'] = 'Input dfx has non-numeric columns: %s; these columns will be ignored!' % x_cate_cols

        dfu = dfx[x_numer_cols]

        fa = FactorAnalyzer()
        fa.analyze(dfu, n_factors, rotation=None)
        l = fa.loadings             # factor loadings
        c = fa.get_communalities()  # communalities
        s = fa.get_scores(dfu)      # factor scores

        l.columns = ['因子%s荷载系数' % (i + 1) for i in range(n_factors)]  # 'factor %s loading coefficient'
        c.columns = ['共同度']  # 'communality'
        s.columns = ['因子%s' % (i + 1) for i in range(n_factors)]  # 'factor %s'

        res = l.join(c)

        return {'result': res, 'msg': msg, 'factor': s}
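# Usage sketch (hedged): the class name FARunner is hypothetical, since the
# original snippet only shows the method; dfx is any DataFrame with numeric columns.
# runner = FARunner()
# out = runner.run(dfx, n_factors=3)
# out['result']   # loadings joined with communalities
# out['factor']   # factor scores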
hospital_reduct_fac = hospital_data[[
    'HospitalID', 'FullTimeCount', 'NetPatientRevenue', 'InpatientOperExp',
    'OutpatientOperExp', 'Operating_Revenue', 'Operating_Income', 'AvlBeds',
    'Compensation', 'MaxTerm'
]]

##Method1: using FactorAnalysis from sklearn
fact_result = fact(n_components=10).fit(hospital_reduct_fac)
fact_result.components_
print(pd.DataFrame(fact_result.components_, columns=hospital_reduct_fac.columns))  # rows are components, columns are features

##Method2: using FactorAnalyzer from factor_analyzer
from factor_analyzer import FactorAnalyzer

fa = FactorAnalyzer()
fa.analyze(hospital_reduct_fac, 10, rotation='varimax')
fa.loadings
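# Note: extracting as many factors as there are variables (10 of each here)
# reproduces essentially all of the variance, so it does not reduce
# dimensionality; in practice fewer factors are retained, e.g. via the
# eigenvalue-greater-than-1 rule.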

#k-means cluster analysis for all numerical data
#Look at unique values of categorical variables
hospital_data.Teaching.unique()
hospital_data.DonorType.unique()
hospital_data.Gender.unique()
hospital_data.TypeControl.unique()
hospital_data.PositionTitle.unique()

#K-Means, 2 clusters
km = cls.KMeans(n_clusters=2).fit(hospital_data.loc[:, [
    'FullTimeCount', 'NetPatientRevenue', 'InpatientOperExp',
    'OutpatientOperExp', 'Operating_Revenue', 'Operating_Income', 'AvlBeds',
    'Compensation', 'MaxTerm'
]])

Example #5
print("After imputing with the single PERMA-score we have",
      survey_df['PERMA'].isnull().sum(), "missing values")

survey_df = survey_df.drop(['index'], axis=1)

# remove duplicated user_id
survey_df = survey_df[~survey_df['insta_user_id'].duplicated(keep='first')]

#%%

from factor_analyzer import FactorAnalyzer

fa = FactorAnalyzer()
fa_features = survey_df[['P', 'E', 'R', 'M', 'A']]
fa.analyze(fa_features, 2,
           rotation=None)  # no rotation: factors remain orthogonal (uncorrelated)
ev, v = fa.get_eigenvalues()

ev  # original eigenvalues; note where they drop below 1
v   # common-factor eigenvalues
fa.loadings
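# Retention sketch (an addition, Kaiser criterion): keep factors whose
# original eigenvalues exceed 1; assumes ev holds the original eigenvalues.
import numpy as np
n_keep = int((np.asarray(ev) > 1).sum())
print('factors with eigenvalue > 1:', n_keep)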

#%%

# Factor analysis to check whether all five variables load on the same latent construct
# Construct argument from this: no need to look at the questions when we have FA
# They all load on the same latent construct
# However, when using the orthogonal rotation...

fa = FactorAnalyzer()
fa.analyze(fa_features, 2,
           rotation='varimax')  # assumption: the truncated call used an orthogonal rotation, per the comment above
# In[16]:


kmo_model  # overall Kaiser-Meyer-Olkin measure of sampling adequacy


# In[17]:


fa = FactorAnalyzer()


# In[18]:


fa.analyze(df, 25, rotation=None)


# In[19]:


fa.analyze(df, 15, rotation=None)


# In[20]:


ev, v = fa.get_eigenvalues()


# In[21]:
Example #7
dm.drop([  # the receiver 'dm' is assumed (it is used below); earlier column names were cut off in the source
         'movment_reason_1st_and_2nd', 'referrer', 'triage_scale',
         'discharge_from_ed', 'hospitalization',
         'receivement_approvement_of_first_sampling', 'ed_record_creation_date',
         'ed_record_creation_hour', 'hospitalization_department',
         'planned_transfer_date', 'planned_transfer_hour',
         'minutes_from_admittance_to_hospitalization_decision',
         'minutes_from_decision_to_arrival_at_hospitalization_department',
         'summary', 'patient_condition_in_release', 'treatment_recommendation',
         'physical_condition', 'eeg', 'registration_datetime'], axis=1, inplace=True)
chi_square_value, p_value = calculate_bartlett_sphericity(dm)
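# Bartlett's sphericity test compares the correlation matrix to the identity
# matrix; a small p-value suggests the variables are correlated enough for
# factor analysis. The print below is an illustrative addition.
print('Bartlett chi-square: %.2f, p-value: %.4g' % (chi_square_value, p_value))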

# dn.drop(['last_results_document_creation_hour','registration_datetime'],axis=1, inplace=True)


fa = FactorAnalyzer()
fa.analyze(dm, 25, rotation=None)


# begin example
x = np.random.randn(1000)
hist_data = [x]
group_labels = ['distplot']

fig = ff.create_distplot(hist_data, group_labels)
py.iplot(fig, filename='Basic Distplot')
# end example

data = [go.Scatter(x=df['registration_datetime'], y=df['visits_within_hour'])]
# data = [go.Scatter(x=dff['registration_datetime'], y=dff['visits_within_hour'])]

py.iplot(data, filename='time-series-simple')
Example #8
from scipy.optimize import minimize
from scipy.spatial import distance

# raw data
url = 'https://raw.githubusercontent.com/rkn2/factorAnalysisExample/master/bfi%20(1).csv'
df = pd.read_csv(url)
df.columns
unnecessaryColumns = ['gender', 'age', 'education']
df.drop(unnecessaryColumns, axis=1, inplace=True)
df.dropna(inplace=True)
numVars = df.shape[1]  # the unnecessary columns were already dropped above

# regular fa
fa = FactorAnalyzer()
numFactors = 5
fa.analyze(df, numFactors, rotation=None)
L = np.array(fa.loadings)
headings = list(fa.loadings.transpose().keys())
factor_threshold = 0.25
for i, factor in enumerate(L.transpose()):
    descending = np.argsort(np.abs(factor))[::-1]
    contributions = [(np.round(factor[x], 2), headings[x]) for x in descending
                     if np.abs(factor[x]) > factor_threshold]
    print('Factor %d:' % (i + 1), contributions)

# factor analysis from a precomputed correlation matrix
fa = FactorAnalyzer()
numFactors = 5
x = (df - df.mean(0)) / df.std(0)
corr = np.cov(x, rowvar=False, ddof=0)  # covariance of standardized data, i.e. (approximately) the correlation matrix
# In[19]:

# Top-ranked variables from the forward feature selection algorithm.
variable

# ## Factor Analysis

# In[20]:

from factor_analyzer import FactorAnalyzer
fa = FactorAnalyzer()

# In[28]:

fa.analyze(train, 3, rotation=None)

# In[29]:

fa.loadings

# In[30]:

fa.get_uniqueness()
# select the feature with the highest uniqueness value; that feature is the first important variable

# ## Principal Component Analysis

# In[31]:

from sklearn.decomposition import PCA
Example #10
#Test VIF again after removing columns with high VIF
X1 = sm.tools.add_constant(data)
VIF = pd.Series([variance_inflation_factor(X1.values, i) for i in range(X1.shape[1])], index=X1.columns)
print('-'*100)
print('Variance Inflation Factor.......')
print('-'*100)
print(VIF)
time.sleep(3)


#Performing factor analysis
print('-'*100)
print('Eigenvalues')
print('-'*100)
fa = FactorAnalyzer()
fa.analyze(data, rotation="varimax")
# Check Eigenvalues
ev, v = fa.get_eigenvalues()
print(ev)
print('-'*100)
print(fa.loadings)
print('-'*100)
print(fa.get_factor_variance())
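# get_factor_variance() returns, per factor: the SS loadings, the proportion
# of variance explained, and the cumulative proportion.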

time.sleep(6)

g = data.columns.tolist()
l=[]
for i in g:
Example #11
seiseki_a = np.array([  # assignment reconstructed; earlier rows and the 'subject' column-name list were cut off in the source
                      [46, 44, 47, 39, 37], [77, 61, 48, 48, 67],
                      [49, 55, 57, 48, 53], [48, 44, 42, 46, 60],
                      [40, 38, 45, 49, 34], [36, 36, 44, 47, 47],
                      [54, 50, 50, 45, 46], [52, 47, 61, 66, 46],
                      [40, 52, 36, 47, 46], [63, 28, 35, 42, 48],
                      [44, 33, 49, 20, 29], [46, 59, 50, 53, 57],
                      [51, 41, 60, 59, 63], [45, 39, 48, 46, 45],
                      [34, 39, 43, 50, 40], [34, 29, 45, 44, 48],
                      [57, 46, 54, 46, 42], [38, 42, 41, 36, 41],
                      [43, 47, 41, 53, 44], [45, 51, 53, 46, 53],
                      [49, 56, 54, 61, 51], [35, 38, 57, 65, 57]])
seiseki_in = pd.DataFrame(seiseki_a, columns=subject)
seiseki = pd.DataFrame(scale(seiseki_in), columns=seiseki_in.columns.values)

fa = FactorAnalyzer()
fa.analyze(seiseki, 2, rotation="varimax")
#fa.analyze(seiseki, 2, rotation="promax")
#fa.analyze(seiseki, 2, rotation=None)

print('相関行列\n', seiseki.corr(method='pearson'))  # correlation matrix
print()
print('因子負荷量', fa.loadings.round(4))  # factor loadings
print()
print('独自性', fa.get_uniqueness().round(4))  # uniqueness
print()
print('因子分散', fa.get_factor_variance().round(4))  # factor variance
print()

##################
#寄与率 (proportion of variance explained)
kiyo = np.array([0, 0])
# %%
np.sum(clf.predict(test_x))

# %%
df = df_2017_c[all_columns]
from factor_analyzer import FactorAnalyzer
fa = FactorAnalyzer()
fa.fit(df)
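# Note: unlike the snippets above, this one uses the scikit-learn style API
# introduced in factor_analyzer 0.3.2, where fit() replaced analyze().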
ev, v = fa.get_eigenvalues()
ev
# %%
plt.scatter(range(1,df.shape[1]+1),ev)
plt.plot(range(1,df.shape[1]+1),ev)
plt.title('Scree Plot')
plt.xlabel('Factors')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()
# %%
fa = FactorAnalyzer()
fa.analyze(df, 3, rotation="varimax")

#%%
plt.hist(df_2017_c['PHQ score'], bins='auto')
# %%
"""Shot, Dead End, Try Bayesian Inference."""

"""
For example, there's smoke or not. and for smoker, the prob of depression is less than 4%, for non-smoker, less than 4% as well (edited) 
so, no matter what x input, it will be better to guess NO.
"""
Example #13
def factor_analysis(dist_df, numFactors=6, prohibited_layers=[]):
    df = copy.deepcopy(dist_df)
    # regular fa
    fa = FactorAnalyzer()
    #
    df.drop([l for l in prohibited_layers if l in df.columns],
            axis=1,
            inplace=True)
    df.dropna(inplace=True)
    df[np.isinf(df)] = 1e12  # cap infinities at a large finite value
    #
    df_std = np.std(df, axis=0)
    valid_std = sorted([c for c in df.columns if df_std[c] > 0])  # keep only columns with nonzero variance
    #
    fa.analyze(df[valid_std], numFactors, rotation='varimax')
    L = np.array(fa.loadings)
    headings = list(fa.loadings.transpose().keys())
    factor_threshold = 0.4
    factors = []
    for i, factor in enumerate(L.transpose()):
        descending = np.argsort(np.abs(factor))[::-1]
        contributions = [(np.round(factor[x], 2), headings[x])
                         for x in descending
                         if np.abs(factor[x]) > factor_threshold]
        factors.append(contributions)
        print('Factor %d:' % (i + 1), contributions)

    inv = False
    stacked_bars = False
    h = len(fa.loadings) / 5
    fig, ax = plt.subplots(1, 1, figsize=(6, h))
    c = np.zeros(L.shape[0])
    for i, factor in enumerate(fa.loadings.columns):
        if not stacked_bars:
            important_features = np.argsort(
                1 - np.abs(fa.loadings[:][factor].values))
            edgecolor = [
                'k' if i in important_features[:10] else 'none'
                for i, e in enumerate(important_features)
            ]
            line_width = [
                1.5 if i in important_features[:10] else 0
                for i, e in enumerate(important_features)
            ]
            if inv:
                data = np.linalg.pinv(fa.loadings).T[:, i]
            else:
                data = fa.loadings[:][factor]
            ax.barh(fa.loadings.T.columns,
                    data,
                    left=c,
                    ec=edgecolor,
                    lw=line_width)
            c += data.max() - data.min()
        else:
            ax.barh(fa.loadings.T.columns,
                    np.abs(fa.loadings[:][factor]),
                    left=c)
            c += np.abs(fa.loadings[:][factor])
        # if stacked_bars:
        #     c += np.abs(fa.loadings[:][factor]).max()
        # else:
        #     c += np.abs(fa.loadings[:][factor])
    ax.tick_params(
        axis='x',  # changes apply to the x-axis
        which='both',  # both major and minor ticks are affected
        bottom=False,  # ticks along the bottom edge are off
        top=False,  # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off
    fig.tight_layout()
    fig.savefig(
        os.path.join(cad_path,
                     'FactorAnalysis_%d_%s.pdf' % (numFactors, stacked_bars)))
    plt.close(fig)

    if plot_wall:
        # make predictions and plot them
        x_orig = dist_df['X_ORIG']
        y_orig = dist_df['Y_ORIG']
        for i, factor in enumerate(fa.loadings.columns):
            Z = np.zeros(len(df))
            inv_loadings = np.linalg.pinv(fa.loadings)
            # inv_loadings = np.array(fa.loadings).T
            for j, col in enumerate(fa.loadings.T.columns):
                Z += inv_loadings[i, j] * (df[col] -
                                           np.mean(df[col])) / np.std(df[col])
            #
            mag = np.median(np.abs(Z)) * 5
            #
            fig, ax = plt.subplots(4, 1, figsize=(10, 10))
            for w in range(4):
                w_idx = df['WALL_POSITION'] == w
                plot_gt_2d(
                    x_orig[w_idx],
                    y_orig[w_idx],
                    Z[w_idx],
                    factor,
                    directions[w],
                    bbox[w],
                    cmap='RdBu_r',
                    ms=1,
                    lw=0.5,
                    ax=ax[w],
                )
                # vmin=-mag, vmax=mag)
            fig_name = '%stotal_%s.png' % (numFactors, factor)
            fig.savefig(os.path.join(cad_path, fig_name), dpi=150)
            plt.close(fig)

    factor_list = []
    weight_list = []
    for i, factor in enumerate(factors):
        factor_list.append([])
        weight_list.append([])
        for j, condition in enumerate(factor):
            factor_list[i].append(condition[1])
            weight_list[i].append(condition[0])
Example #14
File: list8-7.py  Project: luka3117/toy
    plt.ylim(min(coeff.iloc[:, pca2].min() - 0.1, -1.1),
             max(coeff.iloc[:, pca2].max() + 0.1, 1.1))
    plt.xlabel("F{}".format(pcax))
    plt.ylabel("F{}".format(pcay))
    plt.grid()
    plt.show()


dset = datasets.load_boston()  # note: load_boston was removed in scikit-learn 1.2
boston = pd.DataFrame(dset.data)
boston.columns = dset.feature_names
target = pd.DataFrame(dset.target)
boston = pd.DataFrame(scale(boston), columns=boston.columns)

fa = FactorAnalyzer()
fa.analyze(boston, 2, rotation="varimax")  # varimax回転をする場合
#fa.analyze(boston, 2, rotation="promax")  # promax回転をする場合
#fa.analyze(boston, 2, rotation=None)      # 回転をしない場合
#fa.analyze(boston, 7, rotation="varimax") # scree plotの時に7因子まで算出

print('相関行列\n', boston.corr(method='pearson').round(4))  # correlation matrix
print()
print('因子負荷量', fa.loadings.round(4))  # factor loadings
print()
print('独自性', fa.get_uniqueness().round(4))  # uniqueness
print()
print('因子分散', fa.get_factor_variance().round(4))  # factor variance
print()


#################
Example #15
from sklearn.preprocessing import StandardScaler
from factor_analyzer import FactorAnalyzer
import pandas as pd
import factor_analyzer

air = pd.read_csv('../initial/data_pollution/lon_lsoa_pollution_all.csv')
air.head()

# take just indicators and standardise
vals = air.iloc[:,1:].values

ss = StandardScaler()
air_s = pd.DataFrame(ss.fit_transform(vals), columns = air.columns[1:])

air_s.describe()

# run factor analysis

fa = FactorAnalyzer()
fa.analyze(air_s, 4, method='principal', rotation=None)

fa.loadings.to_csv('pca_results_air_.csv')

Example #16
# take kn-10 shrinkage results health data
health_raw = health_raw[(health_raw['METHOD'] == 'KN-10')]
health_raw = health_raw[['LSOA11CD', 'INDICATOR_GROUP_CODE', 'rate']]

# pivot to wide table
health = health_raw.pivot(index='LSOA11CD',
                          columns='INDICATOR_GROUP_CODE',
                          values='rate')

health.columns.name = None
health = health.reset_index()

# check correct no. of LSOAs
print(len(health))
health.head()

# take just indicators and standardise
cols = ['DEM', 'DEP', 'CVDPP', 'OB']

vals = health[cols].values
ss = StandardScaler()
health_s = pd.DataFrame(ss.fit_transform(vals), columns=cols)
health_s.head()

# run factor analysis

fa = FactorAnalyzer()
fa.analyze(health_s, 4, method='principal', rotation='varimax')

fa.loadings.to_csv('pca_results_health.csv')