Example #1
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


def PCA_text(df2, ndims, column, use_tfidf=True, params=None, seed=0):
    """Vectorise a text column and project it onto `ndims` principal components.
    `seed` was an undefined module-level global in the original; it is surfaced
    here as a parameter."""
    df = df2.copy()
    params = params or {}  # vectoriser kwargs must be unpacked, not passed positionally
    if use_tfidf:
        bow = TfidfVectorizer(**params).fit_transform(df[column])
    else:
        bow = CountVectorizer(**params).fit_transform(df[column])
    # PCA does not accept sparse input, so densify first
    # (for large vocabularies, TruncatedSVD would avoid this step).
    pca_bow = PCA(n_components=ndims, random_state=seed).fit_transform(bow.toarray())
    pca_bow = pd.DataFrame(pca_bow)
    pca_bow.columns = ['PCA_dim{}_{}'.format(x, column) for x in range(pca_bow.shape[1])]
    return pca_bow
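
A minimal smoke test for the helper above, using a toy DataFrame (the data and column name here are purely illustrative):

import pandas as pd

docs = pd.DataFrame({'text': ['red apple', 'green apple', 'red wine', 'green tea']})
out = PCA_text(docs, ndims=2, column='text', use_tfidf=True)
print(out.shape)             # (4, 2)
print(out.columns.tolist())  # ['PCA_dim0_text', 'PCA_dim1_text']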
Example #2
    def DBSCAN1(self, target: str):
        # Assumes module-level imports: numpy as np, pandas as pd,
        # matplotlib.pyplot as plt, scipy.cluster.hierarchy as shc, and
        # sklearn's StandardScaler, PCA, DBSCAN, silhouette_score and
        # adjusted_rand_score.
        # self.data = self.outlier_analysis(self.data, 'IQR')
        y: np.ndarray = self.data.pop(target).values
        X: np.ndarray = self.data.values

        # Scale the data to bring all attributes to a comparable level;
        # DBSCAN's eps is distance-based, so unscaled features would
        # otherwise dominate the clustering.
        scaler = StandardScaler()
        X_norm = scaler.fit_transform(X)

        # Project the scaled data onto its first two principal components.
        X_PCA = pd.DataFrame(PCA(n_components=2).fit_transform(X_norm),
                             columns=['P1', 'P2'])

        # Hierarchical view of the projected data.
        plt.figure(figsize=(8, 8))
        plt.title('Visualising the data')
        shc.dendrogram(shc.linkage(X_PCA, method='ward'))
        plt.show()

        # Cluster the projected data. eps and min_samples must be tuned to
        # the scaled, 2-D input; the values below are placeholders, not
        # tuned results.
        db = DBSCAN(eps=0.5, min_samples=50).fit(X_PCA)
        y_pred = db.labels_

        # Map each cluster label to a colour; ids beyond the first three
        # clusters fall back to yellow, noise (-1) is black.
        colours = {0: 'r', 1: 'g', 2: 'b', -1: 'k'}
        cvec = [colours.get(label, 'y') for label in y_pred]

        # Plot P1 on the x-axis and P2 on the y-axis, coloured by label.
        plt.figure(figsize=(9, 9))
        plt.scatter(X_PCA['P1'], X_PCA['P2'], c=cvec)

        # Empty scatters supply legend handles without re-plotting the data.
        handles = [plt.scatter([], [], color=c) for c in ('r', 'g', 'b', 'k')]
        plt.legend(handles, ('Label 0', 'Label 1', 'Label 2', 'Noise (-1)'))
        plt.show()

        print("Silhouette[PCA] =",
              silhouette_score(X_PCA, y_pred, metric='euclidean'))
        print("RI[PCA] =", adjusted_rand_score(y, y_pred))
Example #3
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale


def calculate_pc(df, num_pc=None):
    # type: (pd.DataFrame, int) -> pd.DataFrame
    """
    Calculate principal components for a pandas data frame.

    :param df: pandas data frame holding the data to be analysed
    :param num_pc: number of principal components to return; defaults to as
        many components as the input data has attributes
    :return: pandas data frame holding the principal component scores,
        ordered by explained variance (descending)
    """
    pc = pd.DataFrame(PCA().fit_transform(scale(df)))
    pc.columns = ['pc_{}'.format(i) for i in range(1, len(pc.columns) + 1)]
    if num_pc:
        pc = pc.iloc[:, :int(num_pc)]
    return pc
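
A quick usage sketch on random data (shapes and names are illustrative only):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 5)), columns=list('abcde'))
print(calculate_pc(df).shape)                       # (100, 5): all components
print(calculate_pc(df, num_pc=2).columns.tolist())  # ['pc_1', 'pc_2']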
Example #4
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


def save_tables_pca(table):
    """Run PCA and save one table (filtered) with column names and one without.
    Keep components that cumulatively explain at least 80% of the variance.
    `snakemake` is the global object injected when this runs as a Snakemake script."""
    table_column_names = table.columns

    norm_data = StandardScaler().fit_transform(table)
    pca = PCA().fit(norm_data)
    explained_variance = np.cumsum(pca.explained_variance_ratio_)
    threshold_variance = 0.8
    # Index of the first component at which cumulative variance crosses the threshold
    max_index = np.where(explained_variance > threshold_variance)[0][0]

    components = pd.DataFrame(pca.components_)
    components.columns = table_column_names
    components.to_csv(snakemake.params.components, index=False, sep='\t')

    # Reuse the fitted model instead of refitting with a second PCA()
    scores = pca.transform(norm_data)[:, :max_index + 1]
    np.savetxt(snakemake.output.normalise, scores, delimiter='\t')

    scores = pd.DataFrame(scores)
    scores.columns = ['PC_%s' % i for i in range(scores.shape[1])]
    scores.to_csv(snakemake.output.filtered, index=False, sep='\t')
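
The variance-threshold logic can be checked outside of Snakemake; a small sketch with synthetic data (no files written, names illustrative):

import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 10))
X[:, 1] = 2 * X[:, 0] + rng.normal(scale=0.1, size=200)  # correlated columns compress

pca = PCA().fit(StandardScaler().fit_transform(X))
cum = np.cumsum(pca.explained_variance_ratio_)
print("components kept for 80% variance:", np.where(cum > 0.8)[0][0] + 1)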
Example #5
# Applying PCA to the model
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

# df_std (the standardised feature matrix) and data_final (the original frame
# with a 'clusters' label column) are assumed to exist; one plausible way to
# build them is sketched after this example.

# First pass: keep all components to see how the variance is spread
pca = PCA()
pc = pca.fit_transform(df_std)
pc_df = pd.DataFrame(pc)
print(pca.explained_variance_ratio_)

# Second pass: keep only the first two components for plotting
pca = PCA(n_components=2)
pc = pca.fit_transform(df_std)
pc_df = pd.DataFrame(pc)
print(pc_df.head())

# Attach the cluster labels; a distinct name avoids shadowing the PCA model
pca_df = pd.concat([pc_df, data_final['clusters']], axis=1)
pca_df.columns = ['pc1', 'pc2', 'clusters']
print(pca_df.shape)
print(pca_df.head())
print(pca_df.clusters.value_counts())

plt.figure(figsize=(12, 6))
sns.scatterplot(x='pc1', y='pc2', hue='clusters', data=pca_df, palette='Set1')
plt.show()

# Per-cluster means of the original features
data_final.groupby('clusters').Fresh.mean().plot(kind='bar')
plt.show()
data_final.groupby('clusters').Milk.mean().plot(kind='bar')
plt.show()
data_final.groupby('clusters').Grocery.mean().plot(kind='bar')
plt.show()
data_final.groupby('clusters').Frozen.mean().plot(kind='bar')
plt.show()
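
df_std and data_final are not defined in this snippet; one plausible way they would have been built, offered purely as an assumption (the KMeans step is a guess at where 'clusters' comes from):

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

features = data_final.drop(columns=['clusters'], errors='ignore')
df_std = StandardScaler().fit_transform(features)
# If the label column does not exist yet, a clustering step would create it:
# data_final['clusters'] = KMeans(n_clusters=3, random_state=0).fit_predict(df_std)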
Example #6
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from sklearn.decomposition import PCA

# `fit` (a fitted PCA model), `features` (feature names), `X`, `n_components`,
# `color` (a Series named 'label') and `y_pred` are assumed to be defined
# earlier; one plausible setup is sketched after this example.
bar_width, opacity = 0.35, 0.8

# Compare the loadings of the first two components, feature by feature
index = np.arange(len(features))
fpg1 = fit.components_[0]  # transposing a 1-D array is a no-op, so omitted
fpg2 = fit.components_[1]
plt.bar(index, fpg1, bar_width, color='b', alpha=opacity)
plt.bar(index + bar_width, fpg2, bar_width, color='g', alpha=opacity)
plt.xticks(index + bar_width / 2, features, rotation=90)  # centre the tick labels
plt.margins(x=0)
plt.show()

# Time the projection onto the first n_components principal components
t0 = time()
Y = PCA(n_components).fit_transform(X)
t1 = time()
print("PCA: %.2g sec" % (t1 - t0))

Y = pd.DataFrame(Y)
Y.columns = ['x', 'y']              # assumes n_components == 2
Y = pd.concat([Y, color], axis=1)   # hue column for the plot
Y = pd.concat([Y, y_pred], axis=1)  # keep predictions alongside for inspection
myp = sns.lmplot(data=Y,
                 x='x',
                 y='y',
                 hue='label',
                 palette="Set1",
                 markers=["o", "x"],
                 fit_reg=False,
                 legend=True,
                 legend_out=True)
myp._legend.set_title('Normal')
plt.title("PCA (%.2g sec)" % (t1 - t0))
plt.show()
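
The fragment leans on several names defined elsewhere; one plausible, purely illustrative setup:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

X, labels = make_classification(n_samples=300, n_features=8, random_state=0)
features = ['f{}'.format(i) for i in range(X.shape[1])]
n_components = 2
fit = PCA(n_components=n_components).fit(X)  # the fitted model the bar chart reads
color = pd.Series(labels, name='label')      # hue column for the lmplot
y_pred = pd.Series(labels, name='pred')      # stand-in for model predictions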
Example #7
import itertools

import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

# `X` (the raw feature matrix) is assumed to be loaded earlier.
# Optional preprocessing, left disabled in the original:
# from sklearn.preprocessing import StandardScaler, RobustScaler
# X = StandardScaler().fit_transform(X)  # standard scaling
# X = RobustScaler().fit_transform(X)    # or robust scaling

# Reduce dimensionality so the result is visualisable
X = PCA(n_components=2).fit_transform(X)
X = pd.DataFrame(X)
X.columns = ['P1', 'P2']

# DBSCAN grid search over the parameter lists below; `purity` is a
# user-defined scoring helper (see the sketch at the end) and `y` the
# ground-truth labels, both assumed to be defined earlier in the script.
eps_values = [0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5]
min_samples_values = [3, 5, 10, 15, 20, 30, 50, 100]
metrics = ["euclidean", "hamming"]

for eps, min_samples, metric in itertools.product(
        eps_values, min_samples_values, metrics):
    labels = DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit(X).labels_
    print("eps={} min_samples={} metric={} purity={}".format(
        eps, min_samples, metric, purity(labels, y)))

# Plot result (the backend must be selected before pyplot is imported)
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
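
purity is not a scikit-learn function; a common implementation, offered as an assumption about what the snippet intends, scores each cluster by its most frequent true class:

from sklearn.metrics import confusion_matrix

def purity(labels_pred, labels_true):
    """Fraction of points falling in the majority true class of their cluster."""
    cm = confusion_matrix(labels_true, labels_pred)  # rows: true, cols: predicted
    return cm.max(axis=0).sum() / cm.sum()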