# Remove the bookkeeping columns in place before selecting the model input.
data_df.drop(["index", "Date_Time", "Station_ID"], axis=1, inplace=True)

# Variables (columns) used in the analysis. The same list both selects the
# data and labels the variables in the plots.
data_varNames = [
    'air_temp_set_1',
    'altimeter_set_1',
    'dew_point_temperature_set_1d',
    'pressure_set_1d',
    'relative_humidity_set_1',
    'sea_level_pressure_set_1d',
    'wind_speed_set_1',
]
model_data = data_df[data_varNames].values

# HOGGORM PCA AND HOGGORMPLOT:
# Object (row) names are taken from the numeric station id column.
data_objNames = list(data_df['Station_ID_NUMERIC'])

# Cross validation choices for the PCA:
#   cvType=["loo"]      -> leave-one-out cross validation
#   cvType=["KFold", 4] -> k-fold cross validation (here with 4 folds)
# The input data is standardised and centred by setting Xstand=True.
# Seven principal components are computed.
model = ho.nipalsPCA(
    arrX=model_data,
    Xstand=True,
    cvType=["loo"],
    numComp=7,
)

# Three figures are produced with hoggormplot:
#   1: scores plot, 2: loadings plot, 6: explained variance plot
hop.plot(
    model,
    plots=[1, 2, 6],
    objNames=data_objNames,
    XvarNames=data_varNames,
)

# SKLEARN PCA:
# The data is standardised with StandardScaler() and then passed to a PCA
# with seven principal components.
pipeline = Pipeline([
    ('Scaling', StandardScaler()),
    ('pca', PCA(n_components=7)),
])
X_reduced = pipeline.fit_transform(model_data)
#==============================================================================
import hoggorm as ho
import hoggormplot as hop

# Leave out the last column of train_df, group the remaining data on
# 'replicates' and take the mean of every group.
train_avg_df = (
    train_df
    .iloc[:, :-1]
    .groupby('replicates')
    .mean()
)

# Values of the averaged data frame; this is the input array for the PCA.
data = train_avg_df.values

# Column labels are used as variable names in the plots.
data_varNames = list(train_avg_df.columns)

# Row labels, converted to int, are used as object names in the plots.
data_objNames = [int(label) for label in train_avg_df.index]

# PCA with four components, leave-one-out cross validation and Xstand=False.
model = ho.nipalsPCA(
    arrX=data,
    numComp=4,
    cvType=["loo"],
    Xstand=False,
)


# In[17]:


# %matplotlib qt5  # For zooming in and out
get_ipython().run_line_magic('matplotlib', 'inline')

# Plot types 1, 2 and 3 for components 1 and 2.
hop.plot(
    model,
    comp=[1, 2],
    plots=[1, 2, 3],
    XvarNames=data_varNames,
    objNames=data_objNames,
)


# **Components 2 and 3 also showed considerable variation. The points are
# more densely packed in the direction of the 2nd and 3rd components than in
# the direction of the first.**
# In[6]:


data_objNames


# ---

# ### Apply PCA to our data

# Now, let's run PCA on the data using the ``nipalsPCA`` class. The
# documentation provides a
# [description of the input parameters](https://hoggorm.readthedocs.io/en/latest/pca.html).
# Using input parameter ``arrX`` we define which numpy array we would like to
# analyse. By setting input parameter ``Xstand=False`` we make sure that the
# variables are only mean centered, not scaled to unit variance. This is the
# default setting and actually doesn't need to be expressed explicitly. By
# setting parameter ``cvType=["loo"]`` we make sure that we compute the PCA
# model using full cross validation. ``"loo"`` means "Leave One Out". By
# setting parameter ``numComp=4`` we ask for four principal components (PC)
# to be computed.

# In[7]:


model = ho.nipalsPCA(
    arrX=data,
    numComp=4,
    cvType=["loo"],
    Xstand=False,
)


# That's it, the PCA model has been computed. Now we would like to inspect the
# results by visualising them. We can do this using the tailor-made plotting
# function for PCA from the separate
# [**hoggormPlot** package](https://hoggormplot.readthedocs.io/en/latest/).
# If we wish to plot the results for component 1 and component 2, we can do
# this by setting the input argument ``comp=[1, 2]``. The input argument
# ``plots=[1, 2, 3, 4, 6]`` lets the user define which plots are to be
# plotted. If this list for example contains value ``1``, the function will
# generate the scores plot for the model. If the list contains value ``2``,
# then the loadings plot will be plotted. Value ``3`` stands for correlation
# loadings plot, value ``4`` stands for bi-plot and value ``6`` stands for
# explained variance plot. The hoggormPlot documentation provides a
# [description of input parameters](https://hoggormplot.readthedocs.io/en/latest/mainPlot.html).

# In[8]:


hop.plot(
    model,
    comp=[1, 2],
    plots=[1, 2, 3, 4, 6],
    XvarNames=data_varNames,
    objNames=data_objNames,
)


# ---