Пример #1
0
def plotWords():
    """Load pretrained word vectors, reduce them to 2-D with PCA and
    scatter-plot the first 100 words, each annotated with its label.

    NOTE(review): the original unpacked the return of
    load_word2vec_format() into (w2v, d2v), but that classmethod returns
    a single model object, so the unpacking raised at runtime.  It now
    binds the single return value.  The trailing no-op plt.plot() after
    plt.show() was removed.
    """
    # get model, we use w2v only
    w2v = gensim.models.Doc2Vec.load_word2vec_format(
        "C:/Users/ghfiy/PycharmProjects/TwitterProcess/trained.word2vec")
    words_np = []     # embedding vectors, one entry per vocabulary word
    # a list of labels (words), parallel to words_np
    words_label = []
    for word in w2v.vocab.keys():
        words_np.append(w2v[word])
        words_label.append(word)
    print('Added %s words. Shape %s' % (len(words_np), np.shape(words_np)))

    # Project the high-dimensional vectors onto two principal components.
    pca = PCA(n_components=2)
    pca.fit(words_np)
    reduced = pca.transform(words_np)

    # plt.plot(pca.explained_variance_ratio_)
    # Plot only the first 100 words to keep the figure readable.
    for index, vec in enumerate(reduced):
        # print ('%s %s'%(words_label[index],vec))
        if index < 100:
            x, y = vec[0], vec[1]
            plt.scatter(x, y)
            plt.annotate(words_label[index], xy=(x, y))
    plt.show()
Пример #2
0
    def draw(self):
        """Scatter-plot the learned embeddings in two dimensions.

        Reduces every embedding vector with a 2-component PCA and draws
        the first 1000 points, each annotated with its word label taken
        from the document mapper's reversed dictionary.
        """
        rev_dict = self.doc_mapper.reversed_dictionary
        vectors = []
        labels = []
        for idx, emb in enumerate(self.embedding):
            vectors.append(emb)
            labels.append(rev_dict[idx][0])

        projector = PCA(n_components=2)
        projector.fit(vectors)
        points = projector.transform(vectors)

        plt.rcParams["figure.figsize"] = (20, 20)
        # Cap at 1000 annotations to keep the figure legible.
        for idx, point in enumerate(points):
            if idx < 1000:
                x, y = point[0], point[1]
                plt.scatter(x, y)
                plt.annotate(labels[idx], xy=(x, y))
        plt.show()
Пример #3
0
from sklearn.decomposition import PCA
from sklearn import decomposition
from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import Imputer

# Load the two whitespace-delimited data files; unpack=True transposes so
# each returned row is one column of the file.
features = np.loadtxt("features.dat", unpack=True)
response = np.loadtxt("response.dat", unpack=True)

X = np.array(features)
Y = np.array(response)

# print banned_data
# Project both arrays onto their first two principal components.
pca = PCA(n_components=2)
Y_r = pca.fit(response).transform(response)
# NOTE(review): refitting the same estimator below clobbers the response
# fit; Y_r was already computed, so its values are unaffected.
X_r = pca.fit(features).transform(features)
print(Y_r)  # fixed: 'print Y_r' was Python-2-only statement syntax
plt.figure()
plt.scatter(X_r[:,  0], X_r[:, 1])
plt.title('PCA of dataset')
plt.show()

#
# np.random.seed(5)
#
# centers = [[1, 1], [-1, -1], [1, -1]]
# features = datasets.x()
# X = features.data
# y = features.target
#
Пример #4
0
    'default_spread', 'log_trade_value', 'res_day', 'vol', 'extreme_2', 'skew',
    'total_size', 'coupon'
]

# Select the factor columns from the monthly convertible-bond table
# (monthly_cb_value and factor_list are defined above, outside this view).
data = monthly_cb_value[factor_list]

# Missing factor values are filled with zero before decomposition.
data = data.fillna(0)
'''
PCA 方法1: 直接用sklearn的包
优势:用SVD降维,更加标准
劣势:由于不知道实际的correlation matrix,不知道该怎么选择component,以及每个component对应的projection 
'''
from sklearn.decomposition import PCA
# NOTE(review): X below is the toy array from the sklearn PCA docs example
# and is never used by the fit/transform calls on `data` — looks like dead
# code, but it is kept in case unseen later code references it.
X = np.array([[-1, 1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca = PCA(n_components=5)
pca.fit(data)
# transform() is called for inspection only; its result is discarded here.
pca.transform(data)
'''
得到的结果:只有第一个component有意义
pca.explained_variance_ratio_
Out[21]: array([0.94569423, 0.04154984, 0.00570173, 0.00359057, 0.00259915])
'''
'''
PCA 方法2:根据定义自己来进行的PCA
优势:每一步都清楚是怎么做的
劣势:没有采用SVD,只是用correlation matrix来计算
'''
# Standardize each factor to zero mean / unit variance (StandardScaler is
# z-scoring, not 0-1 min-max scaling as the original comment suggested).
from sklearn.preprocessing import StandardScaler
factor_std = StandardScaler().fit_transform(data)
Пример #5
0
# Build one matrix per spectral band (NIR / red / green) from the lists
# accumulated in earlier notebook cells (totalnir/totalred/totalgrn are
# defined outside this view).
nirMatrix = numpy.matrix(totalnir)
redMatrix = numpy.matrix(totalred)
grnMatrix = numpy.matrix(totalgrn)

# In[ ]:

# Horizontal stack: each row is one sample, columns hold the three bands.
matrix = numpy.hstack((nirMatrix, redMatrix, grnMatrix))

# In[ ]:

# Bare expression — displays the matrix when run as a notebook cell.
matrix

# In[ ]:

# Full PCA (all components retained) over the stacked band matrix.
pca = PCA()
pca.fit(matrix)
transform = pca.transform(matrix)

# In[ ]:

# Bare expression — notebook display of the projected data.
transform

# In[ ]:

#Nir
# Threshold the first principal component into two classes.  Both masks are
# computed BEFORE either assignment mutates pca1, so the second mask is not
# affected by the first write.  Values in (-0.14, -0.13] match neither mask
# and are left unchanged — NOTE(review): confirm this gap is intentional.
pca1 = transform[:, 0]
zeroNir = pca1 < -.14
oneNir = pca1 > -.13
pca1[zeroNir] = -2000
pca1[oneNir] = 0
Пример #6
0
# Row-wise mean of trans_1 (trans_1 / avg_1 / avg_2 / data_1 / data_2 / ax
# are all defined earlier in the script, outside this view).
for i in trans_1:
    t = sum(i) / float(len(i))
    avg_1.append(t)
# Euclidean distance between the two average vectors.
dif = 0.0
for i in range(len(data_1[0])):
    dif += (avg_1[i] - avg_2[i])**2
dif = dif**0.5
print("Difference : ", dif)

#print(len(data_1[0])==len(data_2[0]))
# print(len(data_2[0]))
# Pause until the user confirms.  raw_input is Python-2-only; under
# Python 3 this would be input().
raw_input("Press enter to generate graph")

# Project both datasets to 3-D.  NOTE(review): the estimator is refit on
# data_2, so x_1 and x_2 live in different principal-component bases —
# confirm that plotting them in one figure is intended.
ipca = PCA(n_components=3)

ipca.fit(data_1)
x_1 = ipca.transform(data_1)

ipca.fit(data_2)
x_2 = ipca.transform(data_2)

# Split the first 50 projected points of data_1 into coordinate lists.
Xs = []
Ys = []
Zs = []
for i in x_1[0:50]:
    Xs.append(i[0])
    Ys.append(i[1])
    Zs.append(i[2])

# `ax` is a 3-D axes object created elsewhere; red circles = dataset 1.
ax.scatter(Xs, Ys, Zs, c='r', marker='o')
Пример #7
0
print(ddd.shape)

# Standardize the raw matrix before decomposition (ddd is defined above,
# outside this view).
ddd_scaled = scale(ddd)

print(ddd_scaled.shape)

# Fit a 300-component PCA on the standardized matrix.
pca = PCA(n_components=300)
results = pca.fit(ddd_scaled)

print(pca.explained_variance_ratio_)

# Reuse the already-fitted model.  The original called fit_transform()
# here, which redundantly repeated the full SVD that fit() just computed.
a = pca.transform(ddd_scaled)

# Attach the first five principal components as DataFrame columns
# PCA1..PCA5 (collapses the original copy-pasted block; `aa` keeps its
# original final value — the Series for component index 4).
for k in range(5):
    aa = pd.Series(a[:, k].tolist())
    data["PCA%d" % (k + 1)] = aa.values