# ### Biplot
#
# A scatterplot projected onto the first two principal components.

# In[10]:

# Open a fresh figure, wrap the scaled array in a DataFrame that reuses the
# column labels of `df`, then plot it coloured by the PartyID column.
plt.figure()
data_scaled = pd.DataFrame(_scaled, columns=df.columns)
# NOTE(review): the triplot is titled "Biplot", same as the biplot below;
# confirm that the shared title is intended.
triplot(pca, data_scaled, title='ANES 2012 Biplot', color=data_scaled.PartyID)

# In[11]:

biplot(pca, data_scaled, title='ANES 2012 Biplot', color=data_scaled.PartyID)

# Sure, all of the original axes are negative in the first component. That's okay! To quote Dr. Eric Larson:
# > Because all the data is somewhat correlated, giving a mostly unidimensional representation. Positive/negative isn't so important because eigenvectors could theoretically start anywhere--but traditionally we use the origin.
#
# **Update:** The demographic factor of education level has a different sign from the others.

# In[12]:


def fpc_ordered(corr):
    """Reorder correlation matrix based on first principal component (FPC)."""
    # Eigen-decomposition: `ew` holds the eigenvalues and the columns of `ev`
    # hold the matching eigenvectors.
    # NOTE(review): if `corr` is always symmetric, np.linalg.eigh would
    # guarantee real-valued output -- confirm before switching.
    ew, ev = np.linalg.eig(corr)
    idx = np.argsort(ew)[::-1]  # Reordering index of eigenvalues
    # Apply the descending-eigenvalue order to both arrays so they stay paired.
    ew, ev = ew[idx], ev[:, idx]
    # Eigenvector belonging to the largest eigenvalue.
    e1 = ev[:, 0]
    # NOTE(review): the visible chunk ends here; the remainder of this
    # function is not shown and has deliberately not been reconstructed.
# ### Biplot # # A scatterplot projected onto the first two principal components. # In[12]: data_scaled = pd.DataFrame(_scaled, columns=df.columns) triplot(pca, data_scaled, title='ANES {} Biplot'.format(YEAR), color=data_scaled.PartyID) # In[13]: biplot(pca, data_scaled, title='ANES {} Biplot'.format(YEAR), color=data_scaled.PartyID) # In[14]: pca.explained_variance_ # ## Dropping na # In[15]: df2 = df.dropna() #imp = Imputer(strategy='mean') scl = StandardScaler() pca = PCA() pipeline = Pipeline([
for df in DATA_FRAMES[1:] ] scaled = [ scaler_pipeline.transform(df[VARIABLES_CONSISTENT_ACROSS_ALL_YEARS]) for df in DATA_FRAMES[1:] ] scaled = [ pd.DataFrame(arr, columns=VARIABLES_CONSISTENT_ACROSS_ALL_YEARS) for arr in scaled ] # In[9]: for df, year in zip(scaled, YEARS[1:]): biplot(pca, df, title="{} Survey on {} Axes".format(year, YEARS[1]), color=df.PartyID) # In[10]: pipeline = Pipeline([ # ('imp', imp), ('scl', scl), ('pca', pca), ]) scaler_pipeline = Pipeline([ # ('imp', imp), ('scl', scl), ]) evrs = []