def PCA_text(df2, ndims, column, use_tfidf=True, params=None):
    """Project a text column onto its first ``ndims`` principal components.

    The column is vectorized into a document-term matrix (TF-IDF weighted
    when ``use_tfidf`` is true, raw term counts otherwise) and that matrix is
    reduced with PCA.

    :param df2: data frame holding the text column; it is only read
    :param ndims: number of components, passed to ``PCA``
    :param column: name of the text column in ``df2``
    :param use_tfidf: use ``TfidfVectorizer`` instead of ``CountVectorizer``
    :param params: optional dict of keyword arguments for the vectorizer
    :return: data frame with one ``PCA_dim{i}_{column}`` column per component
    """
    # Function-scope import: only CountVectorizer is visibly used elsewhere
    # in this file, so TfidfVectorizer is brought in here.
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer_cls = TfidfVectorizer if use_tfidf else CountVectorizer
    # Unpack as keywords: a dict passed positionally never configures the
    # vectorizer (its constructor options are keyword arguments).
    vectorizer = vectorizer_cls(**(params or {}))
    bow = vectorizer.fit_transform(df2[column])

    # NOTE(review): ``bow`` is a sparse matrix; PCA accepts sparse input only
    # in recent scikit-learn releases - confirm the installed version (or
    # densify / switch to TruncatedSVD) before relying on this.
    # ``seed`` is read from module scope.
    pca_bow = PCA(ndims, random_state=seed).fit_transform(bow)

    names = ['PCA_dim{}_{}'.format(x, column) for x in range(pca_bow.shape[1])]
    return pd.DataFrame(pca_bow, columns=names)
def DBSCAN1(self, target: str):
    """Cluster ``self.data`` with DBSCAN on a 2-D PCA projection.

    Steps: split off the reference labels, project the remaining columns
    onto two principal components, show a Ward dendrogram of the projection,
    fit DBSCAN on it, plot the clusters and print the silhouette score and
    the adjusted Rand index against the reference labels.

    :param target: name of the column holding the reference labels. The
        column is removed from ``self.data`` (``DataFrame.pop``), so the
        method mutates the instance.
    """
    # self.data = self.outlier_analysis(self.data, 'IQR')
    y: np.ndarray = self.data.pop(target).values
    X: np.ndarray = self.data.values

    # PCA is applied to the unscaled feature values.
    # NOTE(review): eps=300 below presumably assumes that scale - confirm
    # before feeding standardized data into the projection.
    X_PCA = pd.DataFrame(PCA(n_components=2).fit_transform(X),
                         columns=['P1', 'P2'])

    plt.figure(figsize=(8, 8))
    plt.title('Visualising the data')
    shc.dendrogram(shc.linkage(X_PCA, method='ward'))
    plt.show()

    # A single fit is enough: labels_ holds the same assignment that
    # fit_predict returns.
    labels = DBSCAN(eps=300, min_samples=50).fit(X_PCA).labels_

    # Fixed colours for the usual labels (-1 is DBSCAN noise); any further
    # cluster id falls back to matplotlib's colour cycle instead of raising.
    colours = {0: 'r', 1: 'g', 2: 'b', -1: 'k'}

    # Plotting P1 on the X-axis and P2 on the Y-axis, one scatter per label
    # so the legend only lists labels that actually occur.
    plt.figure(figsize=(9, 9))
    for label in np.unique(labels):
        members = X_PCA[labels == label]
        plt.scatter(members['P1'], members['P2'],
                    color=colours.get(label, 'C{}'.format(label % 10)),
                    label='Label {}'.format(label))
    plt.legend()
    plt.show()

    # silhouette_score is only defined for 2 .. n_samples - 1 distinct labels.
    n_labels = len(set(labels))
    if 1 < n_labels < len(labels):
        print("Silhouette[PCA] =",
              silhouette_score(X_PCA, labels, metric='euclidean'))
    else:
        print("Silhouette[PCA] = undefined for", n_labels, "distinct label(s)")
    print("RI[PCA] =", adjusted_rand_score(y, labels))
def calculate_pc(df, num_pc=None):
    # type: (pd.DataFrame, int) -> pd.DataFrame
    """
    Compute principal component scores for a Pandas data frame.

    The input is passed through ``scale`` and then through a default
    ``PCA()``; the resulting columns are named ``pc_1``, ``pc_2``, ...

    :param df: Pandas data frame holding the data to be analysed
    :param num_pc: Number of leading components to keep. A falsy value
                   (``None`` or ``0``) keeps every component.
    :return: Pandas data frame of component scores, first component first
    """
    scores = pd.DataFrame(PCA().fit_transform(scale(df)))
    total = scores.shape[1]
    scores.columns = ['pc_{}'.format(position + 1) for position in range(total)]

    if not num_pc:
        return scores

    # Drop every column from position ``num_pc`` onwards.
    surplus = scores.columns[list(range(int(num_pc), total))]
    return scores.drop(surplus, axis=1)
def save_tables_pca(table):
    """
    Run PCA on the standardised table and write three tab-separated files.

    Components are kept up to and including the first one at which the
    cumulative explained-variance ratio exceeds 0.8.

    Files written (paths come from the global ``snakemake`` object):
      * ``snakemake.params.components`` - ``components_`` of the fitted PCA,
        with the input table's column names as header
      * ``snakemake.output.normalise``  - retained scores, no header
      * ``snakemake.output.filtered``   - retained scores with ``PC_<i>``
        column names

    :param table: data frame to decompose. Its column count must equal the
        number of PCA components, otherwise naming the component columns
        fails.
    """
    norm_data = StandardScaler().fit_transform(table)

    # One fit provides the scores and the fitted attributes; refitting the
    # same data a second time would only repeat the decomposition.
    model = PCA()
    scores = model.fit_transform(norm_data)

    threshold_variance = 0.8
    cumulative = np.cumsum(model.explained_variance_ratio_)
    # Position of the first component that pushes the cumulative ratio
    # above the threshold.
    max_index = np.flatnonzero(cumulative > threshold_variance)[0]

    components = pd.DataFrame(model.components_)
    components.columns = table.columns
    components.to_csv(snakemake.params.components, index=False, sep='\t')

    kept = scores[:, :max_index + 1]
    np.savetxt(snakemake.output.normalise, kept, delimiter='\t')

    kept_df = pd.DataFrame(kept)
    kept_df.columns = ['PC_%s' % i for i in range(kept_df.shape[1])]
    kept_df.to_csv(snakemake.output.filtered, index=False, sep='\t')
# Applying PCA to df_std
from sklearn.decomposition import PCA

# Full decomposition, fitted only to print the explained-variance ratio of
# every component.
print(PCA().fit(df_std).explained_variance_ratio_)

# Two-component projection used for the scatter plot below.
pca = PCA(n_components=2)
pc = pca.fit_transform(df_std)
pc_df = pd.DataFrame(pc)
print(pc_df.head())

# Attach the cluster labels. From here on ``pca`` names this data frame,
# not the fitted model.
# NOTE(review): concat(axis=1) aligns on the index and pc_df carries a fresh
# RangeIndex - confirm data_final uses the same index.
pca = pd.concat([pc_df, data_final['clusters']], axis=1)
pca.columns = ['pc1', 'pc2', 'clusters']
for summary in (pca.shape, pca.head(), pca.clusters.value_counts()):
    print(summary)

plt.figure(figsize=(12, 6))
sns.scatterplot(data=pca, x='pc1', y='pc2', hue='clusters', palette='Set1')
plt.show()

# Bar chart of the per-cluster mean of each column.
for category in ('Fresh', 'Milk', 'Grocery'):
    data_final.groupby('clusters')[category].mean().plot(kind='bar')
    plt.show()
# The last chart is drawn but not shown within this block.
data_final.groupby('clusters')['Frozen'].mean().plot(kind='bar')
# Side-by-side bars for rows 0 and 1 of fit.components_, one pair per feature.
index = np.arange(len(features))
fpg1, fpg2 = (np.transpose(fit.components_[row]) for row in (0, 1))
for offset, loadings, bar_colour in ((0, fpg1, 'b'), (bar_width, fpg2, 'g')):
    plt.bar(index + offset, loadings, bar_width, color=bar_colour, alpha=opacity)
plt.xticks(index + bar_width, features, rotation=90)
plt.margins(x=0)

# Time the PCA projection of X.
t0 = time()
Y = PCA(n_components).fit_transform(X)
t1 = time()
print("PCA: %.2g sec" % (t1 - t0))

#ax = fig.add_subplot(151)
#plt.scatter(Y[:, 0], Y[:, 1], c=color, cmap=myCmap,label=classes,alpha=0.5, edgecolors='none')

# The two column names require a two-component projection.
Y = pd.DataFrame(Y, columns=['x', 'y'])
# Append ``color`` and then ``y_pred`` column-wise, one at a time.
for extra in (color, y_pred):
    Y = pd.concat([Y, extra], axis=1)

# NOTE(review): hue='label' expects a column named 'label' to come from
# ``color`` or ``y_pred`` - confirm which one provides it.
myp = sns.lmplot(
    data=Y,
    x='x',
    y='y',
    hue='label',
    palette="Set1",
    markers=["o", "x"],
    fit_reg=False,
    legend=True,
    legend_out=True,
)
new_title = 'Normal'
myp._legend.set_title(new_title)
plt.title("PCA (%.2g sec)" % (t1 - t0))
#ax.xaxis.set_major_formatter(NullFormatter())
# print(X)

# Optional scaling steps, currently disabled.
# # Standard scale data
# X = StandardScaler().fit(X).transform(X)
# print ("----------------------Scaled data----------------------")
# print(X)
# # Robust scale data
# X = RobustScaler().fit(X).transform(X)
# print ("----------------------Scaled data----------------------")
# print(X)

# Reduce X to two principal components so it can be visualised.
X = pd.DataFrame(PCA(n_components=2).fit_transform(X), columns=['P1', 'P2'])

# DBSCAN: candidate parameter values (not consumed within this block).
eps = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
min_samples = [3, 5, 10, 15, 20, 30, 50, 100]
metrics = ["euclidean", "hamming"]

# Single run with fixed parameters.
db_default = DBSCAN(eps=0.001, min_samples=100, metric="euclidean")
db_default.fit(X)
labels = db_default.labels_
# NOTE(review): the report prints EPS / SAMPLES / METRIC, which are defined
# outside this block, while the fit above uses literal values - confirm the
# two agree.
print("eps={} min_samples={} metric={} purity = {}".format(
    EPS, SAMPLES, METRIC, purity(labels, y)))

# Plot result: the backend is selected before pyplot is imported.
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt