Example #1
def plotWords():
    # load the pre-trained vectors; load_word2vec_format returns a single
    # KeyedVectors object, not a (w2v, d2v) pair
    w2v = gensim.models.KeyedVectors.load_word2vec_format(
        "C:/Users/ghfiy/PycharmProjects/TwitterProcess/trained.word2vec")
    words_np = []
    # a list of labels (words)
    words_label = []
    for word in w2v.vocab.keys():
        words_np.append(w2v[word])
        words_label.append(word)
    print('Added %s words. Shape %s' % (len(words_np), np.shape(words_np)))

    pca = PCA(n_components=2)
    pca.fit(words_np)
    reduced = pca.transform(words_np)

    # plt.plot(pca.explained_variance_ratio_)
    for index, vec in enumerate(reduced):
        # print ('%s %s'%(words_label[index],vec))
        if index < 100:
            x, y = vec[0], vec[1]
            plt.scatter(x, y)
            plt.annotate(words_label[index], xy=(x, y))
    plt.show()
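The w2v.vocab mapping used above exists only in gensim 3.x and earlier; in gensim 4.x the vocabulary moved to index_to_key / key_to_index. A minimal sketch of the same load-and-collect step under gensim 4.x (the version assumption is mine, not part of the original snippet):

from gensim.models import KeyedVectors

# gensim >= 4.0: index_to_key replaces the old vocab dict
w2v = KeyedVectors.load_word2vec_format(
    "C:/Users/ghfiy/PycharmProjects/TwitterProcess/trained.word2vec")
words_label = list(w2v.index_to_key)
words_np = [w2v[word] for word in words_label]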
Example #2
    def main():
        print("add dataset into numpy array")
        train_dataset = append_feature(TRAIN_PATH)
        print("train set created successfully")
        test_dataset = append_feature(TEST_PATH)
        print("train set created successfully")

        n_samples, h, w = train_dataset.images.shape

        # alternative: split a single dataset with train_test_split instead
        # of loading separate TRAIN_PATH / TEST_PATH sets
        X_train = train_dataset.data
        y_train = train_dataset.target

        X_test = test_dataset.data
        y_test = test_dataset.target

        # keep the top 70 principal components ("eigenfaces")
        n_components = 70
        pca = PCA(n_components=n_components).fit(X_train)
        eigenfaces = pca.components_.reshape((n_components, h, w))

        print("Projecting the input data on the eigenfaces orthonormal basis")
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)

        eigenface_titles = [
            "eigenface %d" % i for i in range(eigenfaces.shape[0])
        ]
        plot_gallery(eigenfaces, eigenface_titles, h, w)
        plt.show()

        # train a k-NN classifier on the PCA-projected features
        k = 2
        knn_model = KNeighborsClassifier(n_neighbors=k)
        model_save = knn_model.fit(X_train_pca, y_train)
        # serialize and immediately deserialize the fitted model
        # (an in-memory round trip through pickle)
        saved_model = pickle.dumps(model_save)
        knn_from_pickle = pickle.loads(saved_model)

        y_predict = knn_from_pickle.predict(X_test_pca)
        print(classification_report(y_test, y_predict))
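The n_components = 70 above is hard-coded. A minimal sketch of deriving it from the cumulative explained-variance ratio instead (the 0.95 threshold is an assumption, not taken from the original code):

import numpy as np
from sklearn.decomposition import PCA

# fit with all components, then keep just enough to explain ~95% of the variance
pca_full = PCA().fit(X_train)
cumulative = np.cumsum(pca_full.explained_variance_ratio_)
n_components = int(np.searchsorted(cumulative, 0.95)) + 1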
Example #3
    def pca(self):
        if self.inputDataUji.toPlainText() != '':
            print("add dataset into numpy array")
            train_dataset = append_feature(TRAIN_PATH)
            print("train set created successfully")
            test_dataset = append_feature(TEST_PATH)
            print("train set created successfully")

            n_samples, h, w = train_dataset.images.shape

            X_train = train_dataset.data
            y_train = train_dataset.target

            X_test = test_dataset.data
            y_test = test_dataset.target

            n_components = 70
            pca = PCA(n_components=n_components).fit(X_train)
            eigenfaces = pca.components_.reshape((n_components, h, w))

            print(
                "Projecting the input data on the eigenfaces orthonormal basis"
            )
            X_train_pca = pca.transform(X_train)
            X_test_pca = pca.transform(X_test)

            eigenface_titles = [
                "eigenface %d" % i for i in range(eigenfaces.shape[0])
            ]
            plot_gallery(eigenfaces, eigenface_titles, h, w)
            plt.show()

            k = 2
            knn_model = KNeighborsClassifier(n_neighbors=k)
            model_save = knn_model.fit(X_train_pca, y_train)
            saved_model = pickle.dumps(model_save)
            knn_from_pickle = pickle.loads(saved_model)

            y_predict = knn_from_pickle.predict(X_test_pca)
            self.RESULT_CLASSIFICATION = classification_report(
                y_test, y_predict)
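Both this example and Example #2 only round-trip the classifier through memory with pickle.dumps / pickle.loads. A minimal sketch of persisting it to disk instead (the knn_model.pkl filename is hypothetical):

import pickle

# hypothetical filename; any writable path works
with open("knn_model.pkl", "wb") as f:
    pickle.dump(model_save, f)

with open("knn_model.pkl", "rb") as f:
    knn_from_pickle = pickle.load(f)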
Example #4
    def draw(self):
        embeddings = self.embedding
        reversed_dictionary = self.doc_mapper.reversed_dictionary
        words_np = []
        words_label = []
        # pair each embedding vector with its token from the reversed dictionary
        for i in range(len(embeddings)):
            words_np.append(embeddings[i])
            words_label.append(reversed_dictionary[i][0])

        pca = PCA(n_components=2)
        pca.fit(words_np)
        reduced = pca.transform(words_np)

        plt.rcParams["figure.figsize"] = (20, 20)
        for index, vec in enumerate(reduced):
            if index < 1000:
                x, y = vec[0], vec[1]
                plt.scatter(x, y)
                plt.annotate(words_label[index], xy=(x, y))
        plt.show()
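Examples #1 and #4 share one pattern: reduce high-dimensional vectors to 2-D with PCA, then scatter and annotate. A self-contained sketch of that pattern on synthetic data (all shapes and labels here are illustrative):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
vectors = rng.normal(size=(50, 100))   # 50 fake "word vectors" of dimension 100
labels = ["w%d" % i for i in range(50)]

reduced = PCA(n_components=2).fit_transform(vectors)
plt.scatter(reduced[:, 0], reduced[:, 1])
for (x, y), label in zip(reduced, labels):
    plt.annotate(label, xy=(x, y))
plt.show()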
Example #5
    'total_size', 'coupon'
]

data = monthly_cb_value[factor_list]

data = data.fillna(0)
'''
PCA method 1: use sklearn's implementation directly.
Pro: reduces dimensionality via SVD, which is the more standard approach.
Con: without the actual correlation matrix it is unclear how to choose the
components, or what projection each component corresponds to.
'''
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
pca.fit(data)
pca.transform(data)
'''
Result: only the first component is meaningful.
pca.explained_variance_ratio_
Out[21]: array([0.94569423, 0.04154984, 0.00570173, 0.00359057, 0.00259915])
'''
'''
PCA method 2: PCA implemented by hand, from the definition.
Pro: every step is explicit.
Con: no SVD; everything is computed from the correlation matrix.
'''
# scale the data (standardize to zero mean and unit variance)
from sklearn.preprocessing import StandardScaler
factor_std = StandardScaler().fit_transform(data)

# calculating the covariance matrix
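The snippet breaks off at this comment. A minimal sketch of how the manual computation typically continues from factor_std (the variable names below are my own):

import numpy as np

# covariance matrix of the standardized factors (features in columns)
cov_mat = np.cov(factor_std.T)

# eigendecomposition; eigh suits a symmetric matrix
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

# sort eigenpairs by descending eigenvalue and project onto the top two
order = np.argsort(eig_vals)[::-1]
projected = factor_std @ eig_vecs[:, order[:2]]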
Example #6
redMatrix = numpy.matrix(totalred)
grnMatrix = numpy.matrix(totalgrn)

# stack the NIR, red, and green bands into one feature matrix
matrix = numpy.hstack((nirMatrix, redMatrix, grnMatrix))

# fit PCA on the stacked bands and project
pca = PCA()
pca.fit(matrix)
transform = pca.transform(matrix)

# Nir: threshold the first principal component into two classes
pca1 = transform[:, 0]
zeroNir = pca1 < -.14
oneNir = pca1 > -.13
pca1[zeroNir] = -2000
pca1[oneNir] = 0
#Red (Green Parks)
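The fragment ends here. Because pca1 is a view into transform, the two mask assignments above overwrite the first column of transform in place. A minimal equivalent written with np.where, on a copy, purely for illustration:

import numpy as np

# same two-sided threshold; values between -0.14 and -0.13 stay unchanged
pca1 = transform[:, 0].copy()
pca1 = np.where(pca1 < -0.14, -2000, pca1)
pca1 = np.where(pca1 > -0.13, 0, pca1)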
Example #7
    t = sum(i) / float(len(i))
    avg_1.append(t)
dif = 0.0
for i in range(len(data_1[0])):
    dif += (avg_1[i] - avg_2[i])**2
dif = dif**0.5
print("Difference : ", dif)

#print(len(data_1[0])==len(data_2[0]))
# print(len(data_2[0]))
raw_input("Press enter to generate graph")

# despite the name, this is ordinary PCA, not IncrementalPCA
ipca = PCA(n_components=3)

ipca.fit(data_1)
x_1 = ipca.transform(data_1)

ipca.fit(data_2)
x_2 = ipca.transform(data_2)

Xs = []
Ys = []
Zs = []
for i in x_1[0:50]:
    Xs.append(i[0])
    Ys.append(i[1])
    Zs.append(i[2])

ax.scatter(Xs, Ys, Zs, c='r', marker='o')

Xs = []
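The fragment cuts off while collecting the second point cloud. One caveat worth flagging: refitting ipca on data_2 produces a different basis than the data_1 fit, so the two scatter clouds end up in different coordinate systems. A minimal sketch of fitting a single basis on the combined data instead (an alternative, not what the original author did):

import numpy as np
from sklearn.decomposition import PCA

# one shared 3-D basis for both datasets, then project each separately
ipca = PCA(n_components=3).fit(np.vstack((data_1, data_2)))
x_1 = ipca.transform(data_1)
x_2 = ipca.transform(data_2)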