def wordsim(self, path = "wordsim/wordsim353/combined.tab"): (pairs, scores) = self.loadCorpus(path) #m = self.loadSenna("../senna/embeddings/embeddings.txt","../senna/hash/words.lst") #dict{str: np.array()} m = Word2Vec.load_word2vec_format("../google_data/GoogleNews-vectors-negative300.bin.gz", binary=True) print "--- Original Pairs: ---" for pair in pairs: print pair words = set(m.index2word) (pairs,nums) = self.checkWords(m, pairs) print "--- After Matching: ---" ### For WS dataset. #nums = [0, 1, 2, 3, 5, 7, 8, 9, 11, 12, 13, 16, 17, 19, 23, 24, 25, 27, 28, 29, 30, 31, 32, 36, 37, 40, 43, 44, 49, 54, 55, 56, 57, 58, 59, 60, 61, 62, 65, 70, 74, 75, 83, 84, 85, 86, 88, 90, 94, 96, 97, 98, 99, 100, 102, 107, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 135, 136, 137, 141, 142, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156, 161, 162, 163, 164, 165, 169, 171, 173, 174, 177, 178, 183, 184, 188, 190, 191, 194, 197, 198, 206, 210, 213, 214, 218, 219, 220, 221, 224, 225, 226, 227, 228, 230, 235, 238, 242, 247, 255, 256, 257, 259, 260, 267, 269, 273, 275, 277, 278, 279, 280, 282, 285, 286, 287, 288, 289, 291, 296, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 317, 318, 320, 321, 324, 325, 332, 334, 335, 336, 340, 343, 344, 347, 348, 350, 351, 352] print nums print "Original Number of Words",len(pairs) for pair in pairs: print pair matched_pairs = [pairs[num] for num in nums] matched_scores = [scores[num] for num in nums] print "--- After deleting unmatched: ---" print "Number of remaining words", len(matched_pairs) print matched_pairs print matched_scores cosine_scores = [] for tmp in matched_pairs: cosine = 1 - spatial.distance.cosine(m[tmp[0]], m[tmp[1]]) cosine_scores.append(cosine) print "--- After calculating cosine scores:--- " print cosine_scores print "--- Spearman Corelation ---" print stats.spearmanr(matched_scores, cosine_scores) print 
stats.pearsonr(matched_scores, cosine_scores)
def wordsim(self, path, vectorPath): (pairs, scores) = self.loadCorpus(path) m = Word2Vec.load_word2vec_format(vectorPath, binary=True) print "--- Original Pairs: ---" for pair in pairs: print pair logging.warn("Loading completed") words = m.index2word (pairs,nums) = self.checkWords(m, pairs) print "--- After Matching: ---" print nums print "Original Number of Words",len(pairs) for pair in pairs: print pair matched_pairs = [pairs[num] for num in nums] matched_scores = [scores[num] for num in nums] print "--- After deleting unmatched: ---" print "Number of remaining words", len(matched_pairs) print matched_pairs print matched_scores cosine_scores = [] for tmp in matched_pairs: cosine = 1 - spatial.distance.cosine(m[tmp[0]], m[tmp[1]]) cosine_scores.append(cosine) print "--- After calculating cosine scores:--- " print cosine_scores print "--- Spearman Corelation ---" s = stats.spearmanr(matched_scores, cosine_scores) p = stats.pearsonr(matched_scores, cosine_scores) print s print p return (s,p)
def wordsim(self, path="wordsim/wordsim353/combined.tab"): (pairs, scores) = self.loadCorpus(path) #m = self.loadSenna("../senna/embeddings/embeddings.txt","../senna/hash/words.lst") #dict{str: np.array()} m = Word2Vec.load_word2vec_format( "../google_data/GoogleNews-vectors-negative300.bin.gz", binary=True) print "--- Original Pairs: ---" for pair in pairs: print pair words = set(m.index2word) (pairs, nums) = self.checkWords(m, pairs) print "--- After Matching: ---" ### For WS dataset. #nums = [0, 1, 2, 3, 5, 7, 8, 9, 11, 12, 13, 16, 17, 19, 23, 24, 25, 27, 28, 29, 30, 31, 32, 36, 37, 40, 43, 44, 49, 54, 55, 56, 57, 58, 59, 60, 61, 62, 65, 70, 74, 75, 83, 84, 85, 86, 88, 90, 94, 96, 97, 98, 99, 100, 102, 107, 109, 110, 111, 112, 113, 114, 115, 116, 117, 119, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 135, 136, 137, 141, 142, 146, 147, 148, 150, 151, 152, 153, 154, 155, 156, 161, 162, 163, 164, 165, 169, 171, 173, 174, 177, 178, 183, 184, 188, 190, 191, 194, 197, 198, 206, 210, 213, 214, 218, 219, 220, 221, 224, 225, 226, 227, 228, 230, 235, 238, 242, 247, 255, 256, 257, 259, 260, 267, 269, 273, 275, 277, 278, 279, 280, 282, 285, 286, 287, 288, 289, 291, 296, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 314, 317, 318, 320, 321, 324, 325, 332, 334, 335, 336, 340, 343, 344, 347, 348, 350, 351, 352] print nums print "Original Number of Words", len(pairs) for pair in pairs: print pair matched_pairs = [pairs[num] for num in nums] matched_scores = [scores[num] for num in nums] print "--- After deleting unmatched: ---" print "Number of remaining words", len(matched_pairs) print matched_pairs print matched_scores cosine_scores = [] for tmp in matched_pairs: cosine = 1 - spatial.distance.cosine(m[tmp[0]], m[tmp[1]]) cosine_scores.append(cosine) print "--- After calculating cosine scores:--- " print cosine_scores print "--- Spearman Corelation ---" print stats.spearmanr(matched_scores, cosine_scores) print 
stats.pearsonr(matched_scores, cosine_scores)
def conceptcat(self, path = INPUTPATH): #m = Word2Vec.load_word2vec_format("vectors/entity_vectors_880000", binary=True) m = Word2Vec.load_word2vec_format("../google_data/GoogleNews-vectors-negative300.bin.gz", binary=True) #m = Word2Vec.load_word2vec_format("../google_data/freebase-vectors-skipgram1000-en.bin.gz", binary=True) (concepts, labels) = self.loadData(path, m) #concepts = map(lambda x:"e_"+x, concepts) #labels = map(lambda x:"c_"+x, labels) print "Total number of concepts:", len(concepts) print "Total number of categories:", len(set(labels)) #m = self.loadVector("../senna/embeddings/embeddings.txt","../senna/hash/words.lst") #m = Word2Vec.load_word2vec_format("vectors/whole/new_c_e_train_neg10size400min_count1", binary=True) #m = self.loadVector("vectors/whole/new_c_e_train_neg10size400min_count1.embedding", "vectors/whole/new_c_e_train_neg10size400min_count1.list") words = set(m.index2word) for concept in concepts: if concept not in words: print concept for label in set(labels): if label not in words: print label pre_labels = self.cat_predict(concepts, labels, m) kmeans_pre_labels = self.kmeans(concepts, labels, m, method = 'agg_ward') print "Gold Standard:",labels print "Predicted by nearest vectors:",pre_labels print "Predicted by K-means:",kmeans_pre_labels purity = self.purity(labels, pre_labels) accuracy = self.accuracy(labels, pre_labels) print "Purity:", purity print "Accuracy:", accuracy print "K-means Purity:", self.purity(labels, kmeans_pre_labels) print "K-means Accuracy:", self.accuracy(labels, kmeans_pre_labels)
def __init__(self, inter_filePath = "inter/technology_companies_of_the_united_states/"): # [[cat,cat...]...] self.m = Word2Vec.load_word2vec_format("vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5", binary=True) self.dim = 400 (correct_categories_train, context_categories_train) = self.load_category_page(inter_filePath + "category_page.txt") (correct_categories_test, context_categories_test) = self.load_category_page(inter_filePath + "category_page_test.txt") ## ---- By mean --- Xvectors = np.array(self.predict_vector_by_mean(context_categories_train)) Xvectors_test = np.array(self.predict_vector_by_mean(context_categories_test)) ## ---- By mean --- * ## ---- By SVM --- corpus_train = [" ".join(i) for i in context_categories_train] corpus_test = [" ".join(i) for i in context_categories_test] cv = CountVectorizer(min_df = 1) X = cv.fit_transform(corpus_train) ##TFIDF transformer = TfidfTransformer() X_tfidf = transformer.fit_transform(X) #Labels mlb = MultiLabelBinarizer() mlb.fit(correct_categories_train + correct_categories_test) Y = mlb.transform(correct_categories_train) ###Transform to multilabel indicator #predict test labels X_test = cv.transform(corpus_test) Y_test = mlb.transform(correct_categories_test) #Y_predict_ovr = self.ovrSVM(X, Y, X_test) Y_predict_ovr = self.ovrSVM(Xvectors, Y, Xvectors_test) #Y_predict_ovo = self.ovoSVM(X, Y, X_test) print "---One versus rest---" print "Macro F-1:", f1_score(Y_test, Y_predict_ovr, average='macro') print "Micro F-1:", f1_score(Y_test, Y_predict_ovr, average='micro')
def convert(fileName):
    """Export a binary word2vec model to two parallel text files.

    Writes ``fileName + '.list'`` (one vocabulary word per line) and
    ``fileName + '.embedding'`` (one space-separated vector per line,
    same order), both UTF-8 encoded.

    fileName -- path of the binary word2vec model to convert.
    """
    m = Word2Vec.load_word2vec_format(fileName, binary=True)
    words = m.index2word
    # BUG FIX: both output files were opened but never closed, so the tail
    # of the buffered output could be lost; use context managers.
    with codecs.open(fileName + '.list', 'w', encoding='utf-8') as list_file, \
         codecs.open(fileName + '.embedding', 'w', encoding='utf-8') as embedding_file:
        for word in words:
            list_file.write(word + '\n')
            em = " ".join(map(str, list(m[word])))
            embedding_file.write(em + '\n')
def __init__(self, vectorfile, category_page_file, dim):
    """Load an embedding model and prepare category-prediction state.

    vectorfile -- binary word2vec model to load.
    category_page_file -- category-page description file to evaluate.
    dim -- dimensionality of the loaded vectors.
    """
    self.m = Word2Vec.load_word2vec_format(vectorfile, binary=True)
    self.dim = dim
    # Per-page gold, predicted and context category lists.
    self.correct_categories = []
    self.predict_categories = []
    self.context_categories = []
    # Per-entity predicted vectors and the entity names themselves.
    self.predict_vectors = []
    self.entities = []
    # Populate the containers, then derive the prediction vectors.
    self.load_category_page(category_page_file)
    self.load_entity_page("inter/entity_page_name.txt")
    self.cal_predict_vector()
def conceptcat(self, vectorPath, path=INPUTPATH):
    # Concept-categorization evaluation: predict a category per concept by
    # nearest label vector and by clustering, dump per-concept result files
    # for the paper, and print purity/accuracy for both schemes.
    #
    # vectorPath -- binary word2vec model holding "e_<entity>" and
    #               "c_<category>" vectors.
    # path       -- concept/label data file (default INPUTPATH).
    (concepts, labels) = self.loadData(path)
    # Prefix names with the markers used in the trained vector vocabulary.
    concepts = map(lambda x: "e_" + x, concepts)
    labels = map(lambda x: "c_" + x, labels)
    print "Total number of concepts:", len(concepts)
    print "Total number of categories:", len(set(labels))
    #m = self.loadVector("../senna/embeddings/embeddings.txt","../senna/hash/words.lst")
    m = Word2Vec.load_word2vec_format(vectorPath, binary=True)
    #m = self.loadVector("vectors/whole/new_c_e_train_neg10size400min_count1.embedding", "vectors/whole/new_c_e_train_neg10size400min_count1.list")
    words = set(m.index2word)
    # Report anything missing from the model vocabulary.
    for concept in concepts:
        if concept not in words:
            print concept
    for label in set(labels):
        if label not in words:
            print label
    pre_labels = self.cat_predict(concepts, labels, m)
    kmeans_pre_labels = self.kmeans(concepts, labels, m, method='agg_ward')
    ### for paper: draw form of predicting result
    # Strip the 2-character "e_"/"c_" prefixes back off for display.
    pre_labels_ori = map(lambda x: x[2:], pre_labels)
    labels_ori = map(lambda x: x[2:], labels)
    concepts_ori = map(lambda x: x[2:], concepts)
    d = defaultdict(list)
    f = open('form_result', 'w')
    print len(concepts_ori)
    for i in xrange(len(concepts_ori)):
        # Record each mis-predicted concept with predicted vs gold label.
        if pre_labels_ori[i] != labels_ori[i]:
            f.write(concepts_ori[i] + " " + pre_labels_ori[i] + " " + labels_ori[i] + "\n")
        # NOTE(review): indentation reconstructed from a collapsed source —
        # this append is taken to run for every i (cluster membership for
        # all concepts), not only for mismatches; confirm against original.
        d[kmeans_pre_labels[i]].append(concepts_ori[i])
    print d
    f2 = open('form_result_clustering', 'w')
    # One line per cluster: cluster id followed by its member concepts.
    for key, value in d.iteritems():
        f2.write(str(key) + " ")
        f2.write(" ".join(value))
        f2.write('\n')
    f.close()
    f2.close()
    ###
    print "Gold Standard:", labels
    print "Predicted by nearest vectors:", pre_labels
    print "Predicted by K-means:", kmeans_pre_labels
    purity = self.purity(labels, pre_labels)
    accuracy = self.accuracy(labels, pre_labels)
    print "Purity:", purity
    print "Accuracy:", accuracy
    print "K-means Purity:", self.purity(labels, kmeans_pre_labels)
    print "K-means Accuracy:", self.accuracy(labels, kmeans_pre_labels)
def __init__( self, inter_filePath="inter/technology_companies_of_the_united_states/"): # [[cat,cat...]...] self.m = Word2Vec.load_word2vec_format( "vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5", binary=True) self.dim = 400 (correct_categories_train, context_categories_train ) = self.load_category_page(inter_filePath + "category_page.txt") (correct_categories_test, context_categories_test ) = self.load_category_page(inter_filePath + "category_page_test.txt") ## ---- By mean --- Xvectors = np.array( self.predict_vector_by_mean(context_categories_train)) Xvectors_test = np.array( self.predict_vector_by_mean(context_categories_test)) ## ---- By mean --- * ## ---- By SVM --- corpus_train = [" ".join(i) for i in context_categories_train] corpus_test = [" ".join(i) for i in context_categories_test] cv = CountVectorizer(min_df=1) X = cv.fit_transform(corpus_train) ##TFIDF transformer = TfidfTransformer() X_tfidf = transformer.fit_transform(X) #Labels mlb = MultiLabelBinarizer() mlb.fit(correct_categories_train + correct_categories_test) Y = mlb.transform( correct_categories_train) ###Transform to multilabel indicator #predict test labels X_test = cv.transform(corpus_test) Y_test = mlb.transform(correct_categories_test) #Y_predict_ovr = self.ovrSVM(X, Y, X_test) Y_predict_ovr = self.ovrSVM(Xvectors, Y, Xvectors_test) #Y_predict_ovo = self.ovoSVM(X, Y, X_test) print "---One versus rest---" print "Macro F-1:", f1_score(Y_test, Y_predict_ovr, average='macro') print "Micro F-1:", f1_score(Y_test, Y_predict_ovr, average='micro')
def classify(self, filePath, vectorPath): # 1. load Corpus from files (corpus, labels) = self.loadFile(filePath) #(corpus, labels) = self.loadFile("dataless/20NG/20ng-train-no-stop.txt") #d = defaultdict(Counter) #for i in xrange(len(corpus)): # d[labels[i]].update(corpus[i].split()) #for key,value in d.iteritems(): # print key # print value.most_common(10) labels = self.transform_labels(labels) candidate_labels = set(labels) most_freq_label = max(set(labels), key=labels.count) print "candidate labels:",candidate_labels print "most freq label:",most_freq_label #m = self.loadSenna("../senna/embeddings/embeddings.txt","../senna/hash/words.lst") #dict{str: np.array()} m = Word2Vec.load_word2vec_format(vectorPath, binary=True) # 2. Encode Feature Matrix cv = CountVectorizer(min_df=1) X = cv.fit_transform(corpus) # Frequency #print "Frequency:",X #print cv.get_feature_names() #transformer = TfidfTransformer() #X = transformer.fit_transform(X) # TF-IDF weighted entities #print "Tf-idf:",X # 3. calculate final vectors to predict labels # print X[0]for x in X[0]: pre_vectors = self.pre_vectors(X, cv ,m) #print pre_vectors # 4. find predict labels from candidate labels closest to pre_vectors pre_labels = self.pre_labels(pre_vectors, candidate_labels,most_freq_label, m) # print pre_labels # 5. calculate micro-f1 score micro_f1 = f1_score(labels, pre_labels, average='micro') macro_f1 = f1_score(labels, pre_labels, average='macro') print "Micro-F1:",micro_f1 print "Macro-F1:",macro_f1 print "candidate labels", list(candidate_labels) print "Micro-F1 for each label:", precision_recall_fscore_support(labels, pre_labels, labels=list(candidate_labels), average=None)
# Cache the Google News embeddings as a memmapped matrix + vocab file, then
# load them lazily for word-mover's-distance style computations.
from pyemd import emd
import codecs
import logging  # BUG FIX: logging was used below but never imported
import os
import numpy as np
from sklearn.metrics import euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Source binary word2vec model and the memmap/vocab cache derived from it.
W2VFilePath = "../we/w2v/GoogleNews-vectors-negative300.bin.gz"
DATAFilePath = "../we_data/embed.dat"
VOCABFilePath = "../we_data/embed.vocab"

# One-time conversion: dump the embedding matrix to a memmap file and the
# vocabulary (in index order) to a text file, so later runs can load lazily.
if not os.path.exists(DATAFilePath):
    print("Caching word embeddings in memmapped format...")
    from word2vec_music import Word2Vec
    wv = Word2Vec.load_word2vec_format(W2VFilePath, binary=True)
    # NOTE(review): relies on wv.syn0norm being populated by the loader —
    # confirm, otherwise init_sims() must be called first.
    fp = np.memmap(DATAFilePath, dtype=np.double, mode='w+',
                   shape=wv.syn0norm.shape)
    fp[:] = wv.syn0norm[:]
    with codecs.open(VOCABFilePath, "w", encoding='utf-8') as f:
        for _, w in sorted(
                (voc.index, word) for word, voc in wv.vocab.items()):
            f.write(w + '\n')
    del fp, wv

logging.warning("Loading Word Embedding")
# 3,000,000 x 300: the Google News embedding matrix shape.
W = np.memmap(DATAFilePath, dtype=np.double, mode="r", shape=(3000000, 300))
vocab_list = []
with codecs.open(VOCABFilePath, "r", encoding='utf-8') as f:
    for w in f:
        vocab_list.append(w.strip())
#vocab_list = map(str.strip, f.readlines())
# Word -> row index into W.
vocab_dict = {w: k for k, w in enumerate(vocab_list)}
logging.warning("Word Embedding Loaded")
# NOTE(review): this block appears to be a duplicate of the caching script
# above, truncated mid-statement (the final ``for`` loop has no body);
# code left exactly as found.
from pyemd import emd
import os
import numpy as np
import codecs
from sklearn.metrics import euclidean_distances
from sklearn.feature_extraction.text import CountVectorizer

# NOTE(review): ``logging`` is used below but not imported in this view —
# presumably imported elsewhere; confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Source binary word2vec model and the memmap/vocab cache derived from it.
W2VFilePath = "../we/w2v/GoogleNews-vectors-negative300.bin.gz"
DATAFilePath = "../we_data/embed.dat"
VOCABFilePath = "../we_data/embed.vocab"

# One-time conversion of the binary model into a memmapped matrix plus a
# vocabulary listing, so later runs can load lazily.
if not os.path.exists(DATAFilePath):
    print("Caching word embeddings in memmapped format...")
    from word2vec_music import Word2Vec
    wv = Word2Vec.load_word2vec_format(W2VFilePath, binary=True)
    fp = np.memmap(DATAFilePath, dtype=np.double, mode='w+',
                   shape=wv.syn0norm.shape)
    fp[:] = wv.syn0norm[:]
    # Vocabulary written in index order, one word per line.
    with codecs.open(VOCABFilePath, "w", encoding='utf-8') as f:
        for _, w in sorted(
                (voc.index, word) for word, voc in wv.vocab.items()):
            f.write(w + '\n')
    del fp, wv

logging.warning("Loading Word Embedding")
# 3,000,000 x 300: the Google News embedding matrix shape.
W = np.memmap(DATAFilePath, dtype=np.double, mode="r", shape=(3000000, 300))
vocab_list = []
with codecs.open(VOCABFilePath, "r", encoding='utf-8') as f:
    for w in f:
from word2vec_music import Word2Vec from pre_process import Pre_process from time import time import numpy as np import random import logging import sys logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) f = open('dota1000','r') categories = [] for l in f: categories.append('c_'+l.split(' ')[0].strip()) print categories print len(categories) m = Word2Vec.load_word2vec_format("vectors/whole/new_c_e_train_neg10size400min_count1", binary=True) for category in categories: if category not in m: print category, "not in vectors" for category in categories: x = map(lambda x: x[0],m.most_similar(positive = [category], topn = 500)) x = [entry for entry in x if entry[0]=='e' and "_" not in entry[2:] and "(" not in entry] ##get nearest entities print "-------category:--------" print category print "-------entities:--------" print x
# t-SNE-style visualization prep: sample a few category vectors, gather
# their nearest neighbors, and collect the vectors for 2-D embedding.
from matplotlib import offsetbox
from sklearn import manifold, datasets, decomposition, ensemble, discriminant_analysis, random_projection

# NOTE(review): ``logging``, ``sys``, ``random`` and ``Word2Vec`` are used
# below but not imported in this view — presumably imported earlier in the
# file; confirm.
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s",
                    level=logging.INFO)

# Command line: the vector file to visualize and the output image path.
if len(sys.argv) != 3:
    print "Usage : python tsne.py vectorfile savePath"
    print "e.g. : python tsne.py vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5 tsne.png"
    print "e.g. : python tsne.py vectors/technology_companies_based_in_california/cat_train_neg5size400min_count5 tsne.png"
    sys.exit(1)
reload(sys)
sys.setdefaultencoding("utf-8")  # Python 2: allow unicode words throughout
vectorFile = sys.argv[1]

# digits = datasets.load_digits(n_class=6)
## Add my own data
m = Word2Vec.load_word2vec_format(vectorFile, binary=True)
words = m.index2word
### Random sample 100 categories
# Category tokens are the vocabulary words starting with "c".
categories = [word for word in words if word[0] == "c"]
# print categories
samples = random.sample(categories, 3)
y = []
y.extend(samples)
for sample in samples:
    # Each sampled category contributes its 10 nearest-neighbor words.
    x = map(lambda x: x[0], m.most_similar(positive=[sample], topn=10))
    y.extend(x)
### Get 10 nearest neighbors for 100 categories
X = m[y]
n_samples, n_features = X.shape
# n_neighbors = 30
# Duplicate t-SNE visualization prep: sample category vectors, gather
# their nearest neighbors, and collect the vectors for 2-D embedding.
from sklearn import (manifold, datasets, decomposition, ensemble, discriminant_analysis, random_projection)

# NOTE(review): ``logging``, ``sys``, ``random`` and ``Word2Vec`` are used
# below but not imported in this view — presumably imported earlier in the
# file; confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Command line: the vector file to visualize and the output image path.
if len(sys.argv) != 3:
    print "Usage : python tsne.py vectorfile savePath"
    print "e.g. : python tsne.py vectors/technology_companies_of_the_united_states/cat_train_neg5size400min_count5 tsne.png"
    print "e.g. : python tsne.py vectors/technology_companies_based_in_california/cat_train_neg5size400min_count5 tsne.png"
    sys.exit(1)
reload(sys)
sys.setdefaultencoding('utf-8')  # Python 2: allow unicode words throughout
vectorFile = sys.argv[1]

#digits = datasets.load_digits(n_class=6)
## Add my own data
m = Word2Vec.load_word2vec_format(vectorFile, binary=True)
words = m.index2word
### Random sample 100 categories
# Category tokens are the vocabulary words starting with "c".
categories = [word for word in words if word[0] == "c"]
#print categories
samples = random.sample(categories, 3)
y = []
y.extend(samples)
for sample in samples:
    # Each sampled category contributes its 10 nearest-neighbor words.
    x = map(lambda x: x[0], m.most_similar(positive=[sample], topn=10))
    y.extend(x)
### Get 10 nearest neighbors for 100 categories
X = m[y]
n_samples, n_features = X.shape
from pre_process import Pre_process from time import time import numpy as np import random import logging import sys logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) f = open('dota1000', 'r') categories = [] for l in f: categories.append('c_' + l.split(' ')[0].strip()) print categories print len(categories) m = Word2Vec.load_word2vec_format( "vectors/whole/new_c_e_train_neg10size400min_count1", binary=True) for category in categories: if category not in m: print category, "not in vectors" for category in categories: x = map(lambda x: x[0], m.most_similar(positive=[category], topn=500)) x = [ entry for entry in x if entry[0] == 'e' and "_" not in entry[2:] and "(" not in entry ] ##get nearest entities print "-------category:--------" print category print "-------entities:--------" print x