def maketextveccluster2(doc):
    """For each document, cluster its word vectors with KMeans and
    return three dicts keyed by document id:

    - docvec:  unit-normalized cluster centroids
    - docvec2: centroids scaled by log(cluster size + 1) / log(N)
    - docvec3: centroids scaled by cluster size / total word frequency

    `doc` maps a document id to the raw text handed to
    morphological_analysis(); `wordvec` supplies per-word embeddings.
    """
    docvec = {}
    docvec2 = {}
    docvec3 = {}
    for key in doc.keys():
        word2freq = morphological_analysis(doc[key])
        vocab = word2freq.keys()
        freqs = np.array(word2freq.values())
        # add-one smoothed token total; denominator of the log weight
        N = sum(freqs + 1)
        total = float(sum(freqs))
        M = np.array([wordvec(w) for w in vocab])
        # number of clusters grows logarithmically with vocabulary size
        n_clusters = int(int(np.log(len(vocab))) + 1)
        model = KMeans(n_clusters=n_clusters, random_state=1).fit(M)
        centers = model.cluster_centers_
        labels = np.array(model.labels_)
        plain, log_weighted, freq_weighted = [], [], []
        for cid, center in enumerate(centers):
            unit = center / np.linalg.norm(center)
            size = len(labels[labels == cid])
            plain.append(unit)
            log_weighted.append(unit * (np.log(size + 1) / np.log(N)))
            freq_weighted.append(unit * (float(size) / total))
        docvec[key] = np.array(plain)
        docvec2[key] = np.array(log_weighted)
        docvec3[key] = np.array(freq_weighted)
    return docvec, docvec2, docvec3
model = word2vec.Word2Vec.load("Topnewsmodel200.model") #model = word2vec.Word2Vec.load("alltext200.model") model = word2vec.Word2Vec.load("Topnewsmodel200_2.model") voclist = model.vocab.keys() veclist = {} for k in voclist: veclist[k] = np.array(model[k]/np.linalg.norm(model[k])) #veclist[k] = np.array(model[k]) M = np.array(veclist.values()) features = M kmeans = KMeans(n_clusters=DimentionN, random_state=100) #kmeans = KMeans(n_clusters=500, random_state=100) #kmeans_model1000 = kmeans.fit(features) kmeans_model1000 = kmeans.fit(features) #xmeans利用時 #from xmeans import XMeans #x_means = XMeans(random_state = 1).fit(M) #labels = x_means.labels_ #features = features500 kmeans_model = kmeans_model1000 labels = kmeans_model.labels_ d = zip(labels, features) voclist = veclist.keys() word2vecdic = dict(zip(voclist, labels)) def maketext(textid,Folda = "businesstexts"): filename = (Folda + "/" + str(textid)) f = open(filename) data = f.read()
#word2vecdic = pickle.load(open("yahookanjokaiseki/word2vecdic.dump")) #word2vecdic = pickle.load(open("word2vecdic_500_2013_2014_2015.dump","r")) #word2vecdic = pickle.load(open("word2vecdic_500_2013_2014_2015_normed.dump","r")) #極性word2vecを使う場合 model = word2vec.Word2Vec.load_word2vec_format("encow14a_200_antsyn_noun_adj_verb.bin",binary = True) voclist = model.vocab.keys() veclist = {} for k in voclist: veclist[k] = np.array(model[k]/np.linalg.norm(model[k])) M = np.array(veclist.values()) random_state = 10 DimentionN = 500 kmeans = KMeans(n_clusters=DimentionN, random_state=random_state) kmeans_model = kmeans.fit(M) labels = kmeans_model.labels_ #kmeans_model.predict(M[0]) d = zip(labels, M) voclist_ID = veclist.keys() wordIDdic = pickle.load(open("../../AntSynDistinction/corpus/wordIDdic.dump")) IDwordIDdic = {} for key in wordIDdic.keys(): IDwordIDdic[wordIDdic[key]] = key voclist = [] for ID in voclist_ID: if ID != u"</s>": voclist.append(IDwordIDdic[int(ID)]) else: voclist.append("</s>")