def get_vector_pairs(w, X, y, dataset='simlex', save=True):
    """
    Collect source and target vectors for every distinct word occurring in X

    Parameters
    ----------
    w : dict or mapping
      Pair of embeddings. For a dict, the first two entries (in insertion
      order) are used as source and target and their keys are used as the
      embedding names. Any other mapping must expose the two embeddings
      under the keys 'source' and 'target'. Entries that are plain dicts
      are converted with ``Embedding.from_dict``.
      NOTE(review): the previous code mixed both conventions and could not
      run for dict input; confirm the intended input format with callers.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings (not used by this function)

    dataset: str, default: 'simlex'
      Suffix of the file name passed to ``save_vectors``

    save: bool, default: True
      If true, ``save_vectors`` is called with '<source>2<target>_<dataset>'

    Returns
    -------
    (x1, x2): tuple of arrays
      Row i of x1 / x2 is the source / target vector of the i-th distinct
      word (words are taken in sorted order). Words missing from an
      embedding get that embedding's mean vector.
    """
    if isinstance(w, dict):
        # Read the names before discarding the keys.
        names = list(w.keys())
        w_source, w_target = list(w.values())[:2]
    else:
        names = ['source', 'target']
        w_source, w_target = w['source'], w['target']

    if isinstance(w_source, dict):
        w_source = Embedding.from_dict(w_source)
    if isinstance(w_target, dict):
        w_target = Embedding.from_dict(w_target)

    source_words = w_source.vocabulary.word_id
    missing_words = sum(
        1 for query in X for query_word in query
        if query_word not in source_words)
    if missing_words > 0:
        logger.warning("Missing {} source words. Will replace them with mean vector".format(missing_words))

    mean_vector_source = np.mean(w_source.vectors, axis=0, keepdims=True)
    mean_vector_target = np.mean(w_target.vectors, axis=0, keepdims=True)

    # Sorted so that the row order of the result is reproducible.
    x = sorted(set(list(X[:, 0]) + list(X[:, 1])))

    # np.vstack requires a real sequence; generators are rejected by
    # current NumPy releases.
    x1 = np.vstack([w_source.get(word, mean_vector_source) for word in x])
    x2 = np.vstack([w_target.get(word, mean_vector_target) for word in x])

    if save:
        filename = names[0] + '2' + names[1] + '_' + dataset
        # NOTE(review): save_vectors only receives the file name, as before;
        # verify that this is what the helper expects.
        save_vectors(filename)
    return (x1, x2)
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between the model's similarity scores and
    human rated similarity of word pairs, skipping out-of-vocabulary pairs

    A pair is dropped (together with its rating) when either of its words is
    missing from the embedding. Scores are plain dot products of the two
    word vectors.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Report how much of the test vocabulary the embedding covers.
    vocabulary = w.vocabulary.word_id
    flat_words = [query_word for query in X for query_word in query]
    total_words = len(flat_words)
    missing_words = sum(
        1 for query_word in flat_words if query_word not in vocabulary)
    if missing_words > 0:
        logger.info(
            "Missing {} words out of {} total words in test ({}% of words are missing)."
            .format(missing_words, total_words,
                    missing_words / total_words * 100.0))

    # Keep only the pairs for which both words have a vector.
    kept = [
        (w[first], w[second], rating)
        for (first, second), rating in zip(zip(X[:, 0], X[:, 1]), y)
        if first in w and second in w
    ]

    A = np.vstack([left for left, _, _ in kept])
    B = np.vstack([right for _, right, _ in kept])
    y = np.vstack([rating for _, _, rating in kept])
    assert len(A) == len(B) == len(y)

    scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
def checkpoint(self, epoch, sess):
    """
    Computes intrinsic scores for the current embeddings and, when the mean
    score improves on the best one seen so far, dumps the embeddings to
    disk and saves the session

    Parameters
    ----------
    epoch:      Current epoch number (not used by this method)
    sess:       Tensorflow session object

    Returns
    -------
    None
    """
    # Both matrices are fetched as before; only the embedding matrix is used.
    embed_matrix, _ = sess.run([self.embed_matrix, self.context_matrix])

    voc2vec = {wrd: embed_matrix[wid] for wrd, wid in self.voc2id.items()}
    embedding = Embedding.from_dict(voc2vec)
    results = evaluate_on_all(embedding)
    results = {key: round(val[0], 4) for key, val in results.items()}
    curr_int = np.mean(list(results.values()))
    self.logger.info('Current Score: {}'.format(curr_int))

    if curr_int > self.best_int_avg:
        self.logger.info("Saving embedding matrix")
        # The context manager guarantees the file is flushed and closed even
        # if writing fails part-way through.
        with open('{}/{}'.format(self.p.emb_dir, self.p.name), 'w') as f:
            for wid, wrd in self.id2voc.items():
                f.write('{} {}\n'.format(wrd, ' '.join([str(round(v, 6)) for v in embed_matrix[wid].tolist()])))

        self.saver.save(sess=sess, save_path=self.save_path)
        self.best_int_avg = curr_int
def test_standardize_preserve_identity():
    """After lower-casing, 'spider' must keep the vector of the entry that was already spelled 'spider'."""
    raw_vectors = {
        "Spider": [3, 4, 5],
        "spider": [1, 2, 3],
        "spideR": [3, 2, 4],
    }
    w3 = Embedding.from_dict(raw_vectors)

    # Non-destructive variant: the returned copy is checked.
    w4 = w3.standardize_words(inplace=False, lower=True)
    assert w4['spider'][0] == 1

    # In-place variant: the original object is checked.
    w3.standardize_words(inplace=True, lower=True)
    assert w3['spider'][0] == 1
def evaluate_ana(wv, w2i, vocab):
    """
    Run the MSR, Google and SemEval-2012 analogy evaluations on
    length-normalised word vectors

    Parameters
    ----------
    wv : 2-D array
      Word vectors, one row per word.

    w2i : mapping
      Word -> row index into ``wv``.

    vocab : iterable
      Words to include in the embedding passed to the SemEval evaluation.
    """
    # Scale every row to unit Euclidean length.
    row_norms = np.sum(wv**2, 1)**(0.5)
    W_norm = (wv.T / row_norms).T

    evaluate_analogy_msr(W_norm, w2i)
    evaluate_analogy_google(W_norm, w2i)

    wv_dict = {word: W_norm[w2i[word], :] for word in vocab}
    w = Embedding.from_dict(wv_dict)
    evaluate_analogy_semeval2012(w)
def evaluate_synonyms(e, problems):
    """
    Score an embedding on multiple-choice synonym questions

    For every question whose word is in the embedding, the option with the
    smallest cosine distance to the question word is selected; options that
    are not in the embedding are represented by the mean vector.

    Parameters
    ----------
    e : Embedding
      Embedding to evaluate. If falsy, a random 10-dimensional embedding
      over all question and option words is created (debugging aid).

    problems : iterable of (question, options, answer)
      ``options`` is a list of words, ``answer`` the index of the correct
      option.

    Returns
    -------
    score: float
      Fraction of answered questions that were answered correctly.
    """
    # debugging...
    if not e:
        all_words = np.concatenate([[q] + o for q, o, _ in problems])
        e = Embedding.from_dict({w: np.random.random(10) for w in all_words})

    fallback = np.mean(e.vectors, axis=0)

    n_correct = 0
    n_answered = 0
    for question, options, answer in problems:
        if question not in e:
            continue
        print('question: ' + question)
        print(options)

        query = e[question].reshape(1, -1)
        candidates = np.vstack(
            [e[op] if op in e else fallback for op in options])
        distances = cdist(query, candidates, metric='cosine')[0]

        if np.argsort(distances)[0] == answer:
            n_correct += 1
        n_answered += 1

    return n_correct * 1. / n_answered
def evaluate_on_all(w):
    """
    Evaluate Embedding on the WS353 similarity benchmark

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate.

    Returns
    -------
    results: pandas.DataFrame
      Single-row DataFrame with one column per benchmark.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Calculate results on similarity
    logger.info("Calculating similarity benchmarks")
    similarity_tasks = {"WS353": fetch_WS353()}

    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        score = evaluate_similarity(w, data.X, data.y)
        similarity_results[name] = score
        logger.info("Spearman correlation of scores on {} {}".format(
            name, score))

    return pd.DataFrame([similarity_results])
def evaluate_similarity(w, X, y, restrict_to_words=None):
    """
    Calculate Spearman correlation between the model's similarity scores and
    human rated similarity of word pairs

    Words missing from the embedding are represented by the mean vector.
    Scores are plain dot products of the two word vectors.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    restrict_to_words: default: None
      Accepted for interface compatibility; not used by this function.

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        # Imported lazily: only dict input needs the class.
        from web.embedding import Embedding
        w = Embedding.from_dict(w)

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    # np.vstack requires a real sequence; generators are rejected by
    # current NumPy releases.
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])

    scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_simi(wv, w2i, vocab):
    """
    Print Spearman correlations of the given word vectors on the word
    similarity benchmarks

    Parameters
    ----------
    wv : 2-D array
      Word vectors, one row per word.

    w2i : mapping
      Word -> row index into ``wv``.

    vocab : iterable
      Words to include in the evaluated embedding.
    """
    wv_dict = {word: wv[w2i[word], :] for word in vocab}
    w = Embedding.from_dict(wv_dict)

    # Calculate results on similarity
    print("Calculating similarity benchmarks")
    similarity_tasks = {
        "WS353": fetch_WS353(),
        "RG65": fetch_RG65(),
        # "WS353R": fetch_WS353(which="relatedness"),
        # "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "MTurk": fetch_MTurk(),
        "RW": fetch_RW(),
        "MEN": fetch_MEN(),
    }

    for name, data in iteritems(similarity_tasks):
        first_pair = data.X[0]
        print(
            "Sample data from {}, num of samples: {} : pair \"{}\" and \"{}\" is assigned score {}"
            .format(name, len(data.X), first_pair[0], first_pair[1],
                    data.y[0]))
        score = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, score))
def evaluate_analogy(w, X, y, method="add", k=None, category=None, batch_size=100):
    """
    Simple method to score embedding using SimpleAnalogySolver

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    method : {"add", "mul"}
      Method to use when finding analogy answer, see "Improving Distributional Similarity
      with Lessons Learned from Word Embeddings"

    X : array-like, shape (n_samples, 3)
      Analogy questions.

    y : array-like, shape (n_samples, )
      Analogy answers.

    k : int, default: None
      If not None will select k top most frequent words from embedding

    batch_size : int, default: 100
      Increase to increase memory consumption and decrease running time

    category : list, default: None
      Category of each example, if passed function returns accuracy per category
      in addition to the overall performance.
      Analogy datasets have "category" field that can be supplied here.

    Returns
    -------
    result: float or pandas.DataFrame
      Overall accuracy when ``category`` is None. Otherwise a DataFrame with
      columns "accuracy", "correct" and "count", indexed by category, with
      the special row "all" holding the summary over all examples.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert category is None or len(category) == y.shape[0], "Passed incorrect category list"

    solver = SimpleAnalogySolver(w=w, method=method, batch_size=batch_size, k=k)
    y_pred = solver.predict(X)

    if category is None:
        return np.mean(y_pred == y)

    # A plain list compared with `==` yields a single bool instead of an
    # element-wise mask, so make sure we work on an array.
    category = np.asarray(category)

    hits = y_pred == y
    results = OrderedDict({"all": np.mean(hits)})
    count = OrderedDict({"all": len(y_pred)})
    correct = OrderedDict({"all": np.sum(hits)})
    for cat in set(category):
        mask = category == cat
        cat_hits = y_pred[mask] == y[mask]
        results[cat] = np.mean(cat_hits)
        count[cat] = np.sum(mask)
        correct[cat] = np.sum(cat_hits)

    return pd.concat([pd.Series(results, name="accuracy"),
                      pd.Series(correct, name="correct"),
                      pd.Series(count, name="count")],
                     axis=1)
def evaluate_on_semeval_2012_2(w):
    """
    Score embedding on the SemEval-2012 Task 2 data

    For every category the difference between the mean left and mean right
    prototype vectors is compared (dot product) with the difference vector
    of every question pair; the resulting scores are correlated with the
    gold ratings. Words missing from the embedding are represented by the
    mean vector.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    Returns
    -------
    result: pandas.Series
      Spearman correlation per broad category, with special key "all"
      for the summary Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    data = fetch_semeval_2012_2()
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    def _stack(words):
        # np.vstack requires a real sequence; generators are rejected by
        # current NumPy releases.
        return np.vstack([w.get(word, mean_vector) for word in words])

    categories = data.y.keys()
    results = defaultdict(list)
    for c in categories:
        # Get mean of left and right vector
        prototypes = data.X_prot[c]
        prot_left = np.mean(_stack(prototypes[:, 0]), axis=0)
        prot_right = np.mean(_stack(prototypes[:, 1]), axis=0)

        questions = data.X[c]
        question_left = _stack(questions[:, 0])
        question_right = _stack(questions[:, 1])

        scores = np.dot(prot_left - prot_right,
                        (question_left - question_right).T)

        c_name = data.categories_names[c].split("_")[0]
        # NaN happens when there are only 0s, which might happen for very rare words or
        # very insufficient word vocabulary
        cor = scipy.stats.spearmanr(scores, data.y[c]).correlation
        results[c_name].append(0 if np.isnan(cor) else cor)

    final_results = OrderedDict()
    final_results["all"] = sum(sum(v) for v in results.values()) / len(categories)
    for k in results:
        final_results[k] = sum(results[k]) / len(results[k])
    return pd.Series(final_results)
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between the model's similarity scores and
    human rated similarity of word pairs, using only fully covered pairs

    Pairs in which either word is missing from the embedding are removed
    before scoring; the number of remaining pairs is printed. Scores are
    plain dot products of the two word vectors.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    words = w.vocabulary.word_id

    # Keep only the pairs for which both words are in the vocabulary.
    kept = [i for i, pair in enumerate(X)
            if pair[0] in words and pair[1] in words]
    print('exist {} in {}'.format(len(kept), len(X)))
    X = np.array([X[i] for i in kept])
    y = np.array([y[i] for i in kept])

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    # np.vstack requires a real sequence; generators are rejected by
    # current NumPy releases.
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])

    scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_similarity(w, X, y, missing_words='mean'):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    missing_words: {'mean', 'filter_out'}, default: 'mean'
      How to treat words that are not in the embedding: 'mean' replaces
      them with the mean vector, 'filter_out' drops every pair containing
      such a word. The value is irrelevant when no word is missing.

    Returns
    -------
    cor: float
      Spearman correlation

    Raises
    ------
    ValueError
      If words are missing and ``missing_words`` is not a known policy.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    n_missing_words = count_missing_words(w, X)
    if n_missing_words > 0:
        logger.warning("Missing {} words.".format(n_missing_words))

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    if missing_words == 'mean' or n_missing_words == 0:
        if n_missing_words:
            logger.info("Will replace them with mean vector")
        A = [w.get(word, mean_vector) for word in X[:, 0]]
        B = [w.get(word, mean_vector) for word in X[:, 1]]
    elif missing_words == 'filter_out':
        logger.info("Will ignore them")
        A, B, y_filtered = [], [], []
        for (a, b), gt in zip(X, y):
            if a not in w or b not in w:
                continue
            A.append(w.get(a, mean_vector))
            B.append(w.get(b, mean_vector))
            y_filtered.append(gt)
        y = np.asarray(y_filtered)
    else:
        # Previously an unknown policy silently produced empty score lists
        # and failed later inside spearmanr with an unrelated message.
        raise ValueError(
            "Unknown missing_words policy: {!r}".format(missing_words))

    scores = np.array([cosine_similarity(v1, v2) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Words missing from the embedding are represented by the mean vector.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    words = w.vocabulary.word_id
    missing_words = sum(
        1 for query in X for query_word in query if query_word not in words)
    if missing_words > 0:
        logger.warning(
            "Missing {} words. Will replace them with mean vector".format(
                missing_words))

    # Fallback vector for out-of-vocabulary words.
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    # One row per pair member. np.vstack requires a real sequence;
    # generators are rejected by current NumPy releases.
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])

    # Cosine similarity of each pair: dot product over product of norms.
    scores = np.array([
        v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        for v1, v2 in zip(A, B)
    ])
    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_word_analogy(wv, w2i, vocab):
    """
    Run the MSR, Google and SemEval-2012 analogy evaluations on
    length-normalised word vectors

    Parameters
    ----------
    wv : 2-D array
      Word vectors, one row per word.

    w2i : mapping
      Word -> row index into ``wv``.

    vocab : iterable
      Words to include in the embedding passed to the SemEval evaluation.
    """
    # Scale every row to unit Euclidean length.
    row_norms = np.sum(wv**2, 1)**(0.5)
    W_norm = (wv.T / row_norms).T

    evaluate_analogy_msr(W_norm, w2i)
    evaluate_analogy_google(W_norm, w2i)

    wv_dict = {word: W_norm[w2i[word], :] for word in vocab}
    w = Embedding.from_dict(wv_dict)
    evaluate_analogy_semeval2012(w)
def evaluate(embed_matrix: dict, voc2id: dict) -> float:
    """
    Computes the mean intrinsic score of the given embeddings

    Parameters
    ----------
    embed_matrix:   Container of word vectors, indexed by word id
    voc2id:         Mapping from word to word id

    Returns
    -------
    Mean over all benchmarks returned by ``evaluate_on_all`` of the first
    row of each result column, each rounded to 4 decimals
    """
    # The return annotation used to be `np.float`, an alias that has been
    # removed from NumPy and made the definition itself raise.
    voc2vec = {wrd: embed_matrix[wid] for wrd, wid in voc2id.items()}
    embedding = Embedding.from_dict(voc2vec)
    results = evaluate_on_all(embedding)
    results = {key: round(val[0], 4) for key, val in results.items()}
    curr_int = np.mean(list(results.values()))
    return curr_int
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs, using only fully covered pairs

    Pairs in which any word is missing from the embedding are removed
    (together with their rating) before scoring.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    words = w.vocabulary.word_id

    # Indices of the pairs whose words are all in the vocabulary.
    idx = [i for i, query in enumerate(X)
           if all(query_word in words for query_word in query)]
    y = y[idx]
    X = X[idx]

    # np.vstack requires a real sequence; generators are rejected by
    # current NumPy releases.
    A = np.vstack([w.get(word) for word in X[:, 0]])
    B = np.vstack([w.get(word) for word in X[:, 1]])

    scores = np.array([
        v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        for v1, v2 in zip(A, B)
    ])
    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs, using only fully covered pairs

    All pairs are scored (missing words are represented by the mean vector),
    but only the pairs whose two words are both in the embedding enter the
    correlation.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    words = w.vocabulary.word_id

    # Indices of the pairs for which both words are in the vocabulary.
    idx = [i for i, query in enumerate(X)
           if query[0] in words and query[1] in words]
    missing_words = len(X) - len(idx)
    if missing_words > 0:
        logger.warning("Missing {} pairs. ".format(missing_words))

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    # np.vstack requires a real sequence; generators are rejected by
    # current NumPy releases.
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])

    scores = np.array([
        v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        for v1, v2 in zip(A, B)
    ])
    # Diagnostic output: norm of the first left-hand vector and the number
    # of pairs that enter the correlation.
    print("norms", np.linalg.norm(A[0]), len(idx))
    return scipy.stats.spearmanr(scores[idx], y[idx]).correlation
def load_embedding(fname, format="word2vec_bin", normalize=True,
                   lower=False, clean_words=False, load_kwargs=None):
    """
    Loads embeddings from file

    Parameters
    ----------
    fname: string
      Path to file containing embedding

    format: string
      Format of the embedding. Possible values are:
      'word2vec_bin', 'word2vec', 'glove', 'dict'

    normalize: bool, default: True
      If true will normalize all vector to unit length

    lower: bool, default: False
      Passed as ``lower`` to ``standardize_words``.

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs: dict, default: None
      Additional parameters passed to load function. Mostly useful for 'glove' format where you
      should pass vocab_size and dim. None means no additional parameters.

    Returns
    -------
    w: Embedding
      The loaded (and optionally normalized / standardized) embedding.
    """
    assert format in ['word2vec_bin', 'word2vec', 'glove', 'dict'], "Unrecognized format"
    if load_kwargs is None:
        load_kwargs = {}

    if format == "word2vec_bin":
        w = Embedding.from_word2vec(fname, binary=True)
    elif format == "word2vec":
        w = Embedding.from_word2vec(fname, binary=False)
    elif format == "glove":
        w = Embedding.from_glove(fname, **load_kwargs)
    elif format == "dict":
        # NOTE: unpickling can execute arbitrary code; only load files from
        # trusted sources. The context manager closes the file handle, which
        # was previously leaked.
        with open(fname, "rb") as f:
            d = pickle.load(f, encoding='latin1')
        w = Embedding.from_dict(d)

    if normalize:
        w.normalize_words(inplace=True)
    if lower or clean_words:
        w.standardize_words(lower=lower, clean_words=clean_words, inplace=True)
    return w
def evaluate_on_semeval_2012_2(w):
    """
    Score embedding on the SemEval-2012 Task 2 data

    For every category the difference between the mean left and mean right
    prototype vectors is compared (dot product) with the difference vector
    of every question pair; the resulting scores are correlated with the
    gold ratings. Words missing from the embedding are represented by the
    mean vector.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    Returns
    -------
    result: pandas.Series
      Spearman correlation per broad category, with special key "all"
      for the summary Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    data = fetch_semeval_2012_2()
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    categories = data.y.keys()
    results = defaultdict(list)

    for c in categories:
        # np.vstack requires a real sequence (generators are rejected by
        # current NumPy releases), hence the list comprehensions below.

        # Get mean of left and right vector
        prototypes = data.X_prot[c]
        prot_left = np.mean(
            np.vstack([w.get(word, mean_vector) for word in prototypes[:, 0]]),
            axis=0)
        prot_right = np.mean(
            np.vstack([w.get(word, mean_vector) for word in prototypes[:, 1]]),
            axis=0)

        questions = data.X[c]
        question_left = np.vstack(
            [w.get(word, mean_vector) for word in questions[:, 0]])
        question_right = np.vstack(
            [w.get(word, mean_vector) for word in questions[:, 1]])

        scores = np.dot(prot_left - prot_right,
                        (question_left - question_right).T)

        c_name = data.categories_names[c].split("_")[0]
        # NaN happens when there are only 0s, which might happen for very rare words or
        # very insufficient word vocabulary
        cor = scipy.stats.spearmanr(scores, data.y[c]).correlation
        results[c_name].append(0 if np.isnan(cor) else cor)

    final_results = OrderedDict()
    final_results['all'] = sum(sum(v) for v in results.values()) / len(categories)
    for k in results:
        final_results[k] = sum(results[k]) / len(results[k])
    return pd.Series(final_results)
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Words missing from the embedding are represented by the mean vector.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    words = w.vocabulary.word_id
    missing_words = 0
    for query in X:
        missing_words += sum(1 for query_word in query if query_word not in words)
    if missing_words > 0:
        logger.warning("Missing {} words. Will replace them with mean vector".format(missing_words))

    # Fallback vector for out-of-vocabulary words.
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    def _lookup(column):
        # np.vstack requires a real sequence; generators are rejected by
        # current NumPy releases.
        return np.vstack([w.get(word, mean_vector) for word in column])

    A = _lookup(X[:, 0])
    B = _lookup(X[:, 1])

    # Cosine similarity of each pair: dot product over product of norms.
    scores = np.array([v1.dot(v2.T)/(np.linalg.norm(v1)*np.linalg.norm(v2)) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_cate(wv, w2i, vocab, method="all", seed=None):
    """
    Print cluster purity of the given word vectors on the categorization
    benchmarks

    Parameters
    ----------
    wv : 2-D array
      Word vectors, one row per word.

    w2i : mapping
      Word -> row index into ``wv``.

    vocab : iterable
      Words to include in the evaluated embedding.

    method: string, default: "all"
      What method to use. Possible values are "agglomerative", "kmeans", "all".
      If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
      hyperparameter tuning to avoid overfitting).
      If "kmeans" is passed, method will fit KMeans.
      In both cases number of clusters is preset to the correct value.

    seed: int, default: None
      Seed passed to KMeans.
    """
    wv_dict = {word: wv[w2i[word], :] for word in vocab}
    w = Embedding.from_dict(wv_dict)

    # Calculate results on categorization
    print("Calculating categorization benchmarks")
    categorization_tasks = {
        "AP": fetch_AP(),
        "ESSLI_2c": fetch_ESSLI_2c(),
        "ESSLI_2b": fetch_ESSLI_2b(),
        "ESSLI_1a": fetch_ESSLI_1a(),
        "Battig": fetch_battig(),
        "BLESS": fetch_BLESS(),
    }

    categorization_results = {}

    # Calculate results using helper function
    for name, data in iteritems(categorization_tasks):
        print(
            "Sample data from {}, num of samples: {} : \"{}\" is assigned class {}"
            .format(name, len(data.X), data.X[0], data.y[0]))
        # Forward the caller's seed; it used to be replaced by None, which
        # made the `seed` argument ineffective.
        categorization_results[name] = evaluate_categorization(
            w, data.X, data.y, method=method, seed=seed)
        print("Cluster purity on {} {}".format(name, categorization_results[name]))
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Words missing from the embedding are represented by the mean vector.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    known = w.vocabulary.word_id
    missing_words = sum(
        1 for pair in X for token in pair if token not in known)
    if missing_words > 0:
        logger.warning(
            "Missing {} words. Will replace them with mean vector".format(
                missing_words))

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    def _rows(column):
        # One vector per word, falling back to the mean vector.
        return np.vstack([w.get(token, mean_vector) for token in column])

    left_vectors = _rows(X[:, 0])
    right_vectors = _rows(X[:, 1])

    scores = []
    for v1, v2 in zip(left_vectors, right_vectors):
        scores.append(
            v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
    scores = np.array(scores)

    return scipy.stats.spearmanr(scores, y).correlation
def evaluate_on_WordRep(w, max_pairs=1000, solver_kwargs={}):
    """
    Evaluate on WordRep dataset

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    max_pairs: int, default: 1000
      Each category will be constrained to maximum of max_pairs pairs
      (which results in max_pair * (max_pairs - 1) examples)

    solver_kwargs: dict, default: {}
      Arguments passed to SimpleAnalogySolver. It is suggested to limit number of words
      in the dictionary.

    Returns
    -------
    result: pandas.DataFrame
      Columns "accuracy", "correct" and "count", indexed by category plus
      the summary rows "wikipedia", "all" and "wordnet".

    References
    ----------
    Bin Gao, Jiang Bian, Tie-Yan Liu (2015)
     "WordRep: A Benchmark for Research on Learning Word Representations"
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    data = fetch_wordrep()
    categories = set(data.category)

    accuracy, correct, count = {}, {}, {}
    for cat in categories:
        X_cat = data.X[data.category == cat][0:max_pairs]
        n_pairs = X_cat.shape[0]
        size = n_pairs * (n_pairs - 1)

        logger.info("Processing {} with {} pairs, {} questions".format(cat, n_pairs, size))

        # For each category construct question-answer pairs
        X = np.zeros(shape=(size, 3), dtype="object")
        y = np.zeros(shape=(size,), dtype="object")
        row = 0
        for left, right in product(X_cat, X_cat):
            if np.array_equal(left, right):
                continue
            X[row, 0:2] = left
            X[row, 2] = right[0]
            y[row] = right[1]
            row += 1

        # Run solver
        solver = SimpleAnalogySolver(w=w, **solver_kwargs)
        y_pred = solver.predict(X)

        n_correct = float(np.sum(y_pred == y))
        correct[cat] = n_correct
        count[cat] = size
        accuracy[cat] = n_correct / size

    # Add summary results
    summary_groups = [
        ('wikipedia', [c for c in categories if c in data.wikipedia_categories]),
        ('all', list(categories)),
        ('wordnet', [c for c in categories if c in data.wordnet_categories]),
    ]
    for totals in (correct, count):
        for label, members in summary_groups:
            totals[label] = sum(totals[c] for c in members)
    for label, _ in summary_groups:
        accuracy[label] = correct[label] / count[label]

    return pd.concat([pd.Series(accuracy, name="accuracy"),
                      pd.Series(correct, name="correct"),
                      pd.Series(count, name="count")], axis=1)
print("Spearman correlation of scores on {} {}".format( 'WS353', evaluate_similarity(embeddings, X, y))) X, y = fetch_dataset_MEN(MEN) print("Spearman correlation of scores on {} {}".format( 'MEN', evaluate_similarity(embeddings, X, y))) X, y = fetch_dataset_SIM999(SIM999) print("Spearman correlation of scores on {} {}".format( 'SIM999', evaluate_similarity(embeddings, X, y))) if __name__ == '__main__': preprocess_base_dir = '../data/wikipedia/preprocess1B/NumeralAsNumeral' vec_base_dir = '../data/wikipedia/save/1B/prototypes/' # postfix = ['LSTM', 'NumeralAsToken', 'NumeralAsTokenUnkNumeral','NumeralAsUnkNumeral'] # postfix = ['50','100','200','300'] # postfix = ['NumeralAsToken', 'NumeralAsToken3','NumeralAsToken8'] # postfix = ['NumeralAsTokenUnkNumeral5_300'] postfix = ['3', '5'] for p in postfix: vec = glob.glob(vec_base_dir + '/{}/idx2vec_i*.dat'.format(p)) idx2word_path = preprocess_base_dir + '/idx2word.dat' idx2word = pickle.load(open(idx2word_path, 'rb')) for v in vec: print('evaluate vector file {}, in {}'.format(v, p)) idx2vec = pickle.load(open(v, 'rb')) dicts = {idx2word[i]: idx2vec[i] for i in range(len(idx2vec))} embeddings = Embedding.from_dict(dicts) evaluate(embeddings)
def evaluate_similarity(w, X, y,
                        tokenize_oov_words_with_deepcut=False,
                        filter_not_found=False,
                        include_structured_sources=None,
                        cut_letters_for_oov=False,
                        structed_sources_coef=0,
                        numberbatch=None):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance. Vectors created for OOV words are stored
      in it.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    tokenize_oov_words_with_deepcut:
      if a thai word is not found in the embedding (OOV), tokenize it with deepcut,
      and use the mean vector of its in-vocabulary parts

    filter_not_found:
      remove a word pair if one of the words was not found in the embedding vocabulary

    include_structured_sources:
      include using structured source: None, 'wn' (WordNet) or 'cn' (ConceptNet)

    cut_letters_for_oov:
      for an OOV word, cut trailing letters until vocabulary words with the
      same prefix are found and use their mean vector. Only used when
      tokenize_oov_words_with_deepcut is false.

    structed_sources_coef:
      weight for structured source

    numberbatch:
      passed to the ConceptNet scorer when include_structured_sources == 'cn'

    Returns
    -------
    result: dict
      Spearman and Pearson correlation plus bookkeeping counts
      ('num_oov_word_pairs', 'num_found_words', 'num_missing_words',
      'num_oov_created', 'y.shape') and, when a structured source is used,
      'structed_oov_pairs'.

    Raises
    ------
    ValueError
      If include_structured_sources is set to an unknown source name.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words, found_words, oov_vecs_created, index = 0, 0, 0, 0
    word_pair_oov_indices = []
    info_oov_words = {}
    info_created_words = {}

    words = w.vocabulary.word_id

    # Both OOV strategies start from the set of OOV words in the dataset.
    if tokenize_oov_words_with_deepcut or cut_letters_for_oov:
        oov_words = {
            query_word for query in X for query_word in query
            if query_word not in words
        }

    ## NEW: use deepcut to create word vectors of word parts -- if possible
    if tokenize_oov_words_with_deepcut:
        for ds_word in oov_words:
            tokens = deepcut.tokenize(ds_word)
            in_voc_tokens = [tok for tok in tokens if tok in w]

            ## if we found word-parts in the emb - use their vectors (avg) to represent the OOV word
            if in_voc_tokens:
                token_vecs = [w.get(t) for t in in_voc_tokens]
                w[ds_word] = np.mean(token_vecs, axis=0)
                oov_vecs_created += 1
                info_created_words[ds_word] = in_voc_tokens
            else:
                info_oov_words[ds_word] = tokens

        print('All OOV words after deepcut:')
        pprint(info_oov_words)
        print('All "created"/replaced words by deepcut:')
        pprint(info_created_words)

    elif cut_letters_for_oov:
        for oov_word in oov_words:
            cut_word = oov_word
            words_with_same_prefix = set()

            # cutting letter by letter until we find some words with the same prefix
            while len(cut_word) and cut_word not in words:
                cut_word = cut_word[:-1]

                # collecting words with the same prefix
                for vocabulary_word in w:
                    if vocabulary_word[0].startswith(cut_word):
                        words_with_same_prefix.add(vocabulary_word[0])

                # if found at least one word, then stop cutting and let's compute the avg vector
                if len(words_with_same_prefix):
                    break

            print(
                f'FOR WORD {oov_word} FOUND WORDS WITH THE SAME PREFIX: {str(words_with_same_prefix)}'
            )
            if words_with_same_prefix:
                token_vecs = [w.get(t) for t in words_with_same_prefix]
                w[oov_word] = np.mean(token_vecs, axis=0)
                oov_vecs_created += 1
                info_created_words[oov_word] = cut_word

    ## For all words in the datasets, check if the are OOV?
    ## Indices of word-pairs with a OOV word are stored in word_pair_oov_indices
    for query in X:
        for query_word in query:
            if query_word not in words:
                print("Missing Word:", query_word)
                missing_words += 1
                word_pair_oov_indices.append(index)
            else:
                print("Found Word:", query_word)
                found_words += 1
        index += 1

    word_pair_oov_indices = list(set(word_pair_oov_indices))
    print('word_pair_oov_indices', word_pair_oov_indices)

    if missing_words > 0 or oov_vecs_created > 0:
        logger.warning(
            "Missing {} words. Will replace them with mean vector".format(
                missing_words))
        logger.warning(
            "OOV words {} created from their subwords. Will replace them with mean vector of sub-tokens"
            .format(oov_vecs_created))
        logger.warning("Found {} words.".format(found_words))

    print('X.shape', X.shape)
    print('y.shape', y.shape)

    if filter_not_found:
        # added code by wohlg: drop every pair that contains an OOV word
        pairs = np.delete(X, word_pair_oov_indices, 0)
        y = np.delete(y, word_pair_oov_indices)
        print('new_X.shape', pairs.shape)
        print('new_y.shape', y.shape)
    else:
        pairs = X

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    # np.vstack requires a real sequence; generators are rejected by
    # current NumPy releases.
    A = np.vstack([w.get(word, mean_vector) for word in pairs[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in pairs[:, 1]])
    if filter_not_found:
        print(len(A), len(B))
        print(type(A), type(B))

    scores = np.array([
        v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        for v1, v2 in zip(A, B)
    ])

    # alexpulich / wohlg:
    if include_structured_sources == 'wn':
        wn_scores, structed_oov_pairs = compute_wordnet_path_scores(pairs)
        ## wordnet_method1 or wordnet_method2: currently hardcoded, can be refactored if needed :)
        scores = wordnet_method1(list(scores), pairs, wn_scores,
                                 structed_sources_coef)
    elif include_structured_sources == 'cn':
        # should comment, if don't want to use numberbatch
        cn_scores, structed_oov_pairs = compute_conceptnet_path_scores(
            pairs, numberbatch)
        scores = conceptnet_method1(list(scores), pairs, cn_scores,
                                    structed_sources_coef)
    elif include_structured_sources:
        # Previously this ended in an UnboundLocalError when building the
        # result below.
        raise ValueError(
            "Unknown structured source: {!r}".format(
                include_structured_sources))

    # wohlg: original version only returned Spearman
    # wohlg: we added Pearson and other information
    result = {
        'spearmanr': scipy.stats.spearmanr(scores, y).correlation,
        'pearsonr': scipy.stats.pearsonr(scores, y)[0],
        'num_oov_word_pairs': len(word_pair_oov_indices),
        'num_found_words': found_words,
        'num_missing_words': missing_words,
        'num_oov_created': oov_vecs_created,
        'y.shape': y.shape
    }

    if include_structured_sources:
        result['structed_oov_pairs'] = structed_oov_pairs

    return result
def evaluate_on_all(w):
    """
    Run the similarity, analogy and categorization benchmarks on an embedding.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate. A dict is converted with Embedding.from_dict.

    Returns
    -------
    results: pandas.DataFrame
      DataFrame with results, one benchmark per column.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    def _run(tasks, evaluate, message):
        # Score each dataset in turn and log the outcome right after computing it.
        scores = {}
        for task_name, dataset in iteritems(tasks):
            scores[task_name] = evaluate(w, dataset.X, dataset.y)
            logger.info(message.format(task_name, scores[task_name]))
        return scores

    # Similarity benchmarks
    logger.info("Calculating similarity benchmarks")
    similarity_results = _run(
        {
            "MEN": fetch_MEN(),
            "WS353": fetch_WS353(),
            "WS353R": fetch_WS353(which="relatedness"),
            "WS353S": fetch_WS353(which="similarity"),
            "SimLex999": fetch_SimLex999(),
            "RW": fetch_RW(),
            "RG65": fetch_RG65(),
            "MTurk": fetch_MTurk(),
        },
        evaluate_similarity,
        "Spearman correlation of scores on {} {}",
    )

    # Analogy benchmarks
    logger.info("Calculating analogy benchmarks")
    analogy_results = _run(
        {
            "Google": fetch_google_analogy(),
            "MSR": fetch_msr_analogy(),
        },
        evaluate_analogy,
        "Analogy prediction accuracy on {} {}",
    )
    # SemEval has its own entry point; only its 'all' entry is reported.
    semeval_score = evaluate_on_semeval_2012_2(w)['all']
    analogy_results["SemEval2012_2"] = semeval_score
    logger.info("Analogy prediction accuracy on {} {}".format(
        "SemEval2012", semeval_score))

    # Categorization benchmarks
    logger.info("Calculating categorization benchmarks")
    categorization_results = _run(
        {
            "AP": fetch_AP(),
            "BLESS": fetch_BLESS(),
            "Battig": fetch_battig(),
            "ESSLI_2c": fetch_ESSLI_2c(),
            "ESSLI_2b": fetch_ESSLI_2b(),
            "ESSLI_1a": fetch_ESSLI_1a(),
        },
        evaluate_categorization,
        "Cluster purity on {} {}",
    )

    # One single-row frame per benchmark family, joined side by side.
    cat_frame = pd.DataFrame([categorization_results])
    analogy_frame = pd.DataFrame([analogy_results])
    sim_frame = pd.DataFrame([similarity_results])
    return cat_frame.join(sim_frame).join(analogy_frame)
def evaluate_categorization(w, X, y, method="all", seed=None):
    """
    Evaluate embeddings on categorization task.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to test. A dict is converted with Embedding.from_dict.

    X: vector, shape: (n_samples, )
      Vector of words.

    y: vector, shape: (n_samples, )
      Vector of cluster assignments.

    method: string, default: "all"
      What method to use. Possible values are "agglomerative", "kmeans",
      "mean-shift", "spectral", "affinityPropagation", "birch" and "all".
      "all" runs every method and keeps the best purity.
      "agglomerative" fits AgglomerativeClustering with ward/euclidean and
      then a small grid over {cosine, euclidean} x {average, complete}.
      "spectral" tries the 'nearest_neighbors' and 'rbf' affinities.
      The number of clusters is preset to len(set(y)) for every method
      except "mean-shift" and "affinityPropagation", which do not take it.

    seed: int, default: None
      Seed used to shuffle the samples and passed to KMeans and
      SpectralClustering.

    Returns
    -------
    purity: float
      Purity of the best obtained clustering.

    Notes
    -----
    KMedoids method was excluded as empirically didn't improve over KMeans (for categorization
    tasks available in the package).
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    known_methods = ["all", "kmeans", "agglomerative", "mean-shift",
                     "spectral", "affinityPropagation", "birch"]
    assert method in known_methods, "Uncrecognized method"

    # Words missing from the embedding fall back to the mean vector.
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    # np.vstack needs a real sequence: generators are rejected by current NumPy.
    words = np.vstack([w.get(word, mean_vector) for word in X.flatten()])
    ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False)

    # Shuffled views and cluster count are shared by every method below.
    shuffled_words = words[ids]
    shuffled_labels = y[ids]
    n_clusters = len(set(y))

    best_purity = 0

    if method in ("all", "agglomerative"):
        best_purity = calculate_purity(
            shuffled_labels,
            AgglomerativeClustering(n_clusters=n_clusters,
                                    affinity="euclidean",
                                    linkage="ward").fit_predict(shuffled_words))
        logger.debug("Purity={:.3f} using affinity={} linkage={}".format(
            best_purity, 'euclidean', 'ward'))

        for affinity in ["cosine", "euclidean"]:
            for linkage in ["average", "complete"]:
                purity = calculate_purity(
                    shuffled_labels,
                    AgglomerativeClustering(n_clusters=n_clusters,
                                            affinity=affinity,
                                            linkage=linkage).fit_predict(shuffled_words))
                logger.debug("Purity={:.3f} using affinity={} linkage={}".format(
                    purity, affinity, linkage))
                best_purity = max(best_purity, purity)

    if method in ("all", "kmeans"):
        purity = calculate_purity(
            shuffled_labels,
            KMeans(random_state=seed, n_init=10,
                   n_clusters=n_clusters).fit_predict(shuffled_words))
        logger.debug("Purity={:.3f} using KMeans".format(purity))
        best_purity = max(purity, best_purity)

    if method in ("all", "mean-shift"):
        # If this takes too long: bandwidth estimation is much less scalable
        # than the mean shift algorithm itself and will be the bottleneck.
        purity = calculate_purity(
            shuffled_labels,
            MeanShift(bin_seeding=True, n_jobs=5).fit_predict(shuffled_words))
        logger.debug("Purity={:.3f} using MeanShift".format(purity))
        best_purity = max(purity, best_purity)

    if method in ("all", "spectral"):
        for affinity in ['nearest_neighbors', 'rbf']:
            purity = calculate_purity(
                shuffled_labels,
                SpectralClustering(n_clusters=n_clusters,
                                   affinity=affinity,
                                   random_state=seed,
                                   n_jobs=5).fit_predict(shuffled_words))
            logger.debug("Purity={:.3f} using SpectralClustering affinity={}".format(
                purity, affinity))
            best_purity = max(purity, best_purity)

    if method in ("all", "affinityPropagation"):
        # preference=None leaves the number of exemplars (clusters) to the
        # estimator's default, derived from the input similarities.
        # NOTE(review): scikit-learn documents only 'euclidean' and
        # 'precomputed' for AffinityPropagation.affinity; 'cosine' may be
        # rejected at fit time - confirm against the installed version.
        for affinity in ["cosine", "euclidean"]:
            purity = calculate_purity(
                shuffled_labels,
                AffinityPropagation(preference=None,
                                    affinity=affinity).fit_predict(shuffled_words))
            logger.debug("Purity={:.3f} using Affinity Propagation".format(purity))
            best_purity = max(purity, best_purity)

    if method in ("all", "birch"):
        purity = calculate_purity(
            shuffled_labels,
            Birch(threshold=0.5, branching_factor=50,
                  n_clusters=n_clusters).fit_predict(shuffled_words))
        logger.debug("Purity={:.3f} using Birch".format(purity))
        best_purity = max(purity, best_purity)

    return best_purity
def evaluate_on_WordRep(w, max_pairs=1000, solver_kwargs={}):
    """
    Evaluate on WordRep dataset

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    max_pairs: int, default: 1000
      Upper bound on the pairs kept per category; a category with k pairs
      yields k * (k - 1) questions.

    solver_kwargs: dict, default: {}
      Arguments passed to SimpleAnalogySolver. It is suggested to limit number of words in the dictionary.

    Returns
    -------
    pandas.DataFrame
      Columns "accuracy", "correct" and "count"; one row per category plus
      the summary rows 'wikipedia', 'all' and 'wordnet'.

    References
    ----------
    Bin Gao, Jiang Bian, Tie-Yan Liu (2015)
     "WordRep: A Benchmark for Research on Learning Word Representations"
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    data = fetch_wordrep()
    categories = set(data.category)

    accuracy, correct, count = {}, {}, {}

    for cat in categories:
        X_cat = data.X[data.category == cat][0:max_pairs]
        n_pairs = X_cat.shape[0]
        size = n_pairs * (n_pairs - 1)

        logger.info("Processing {} with {} pairs, {} questions".format(
            cat, n_pairs, size))

        # Every ordered combination of two different pairs becomes one
        # question: (a, b, c) -> d.
        X = np.zeros(shape=(size, 3), dtype="object")
        y = np.zeros(shape=(size, ), dtype="object")
        row = 0
        for left, right in product(X_cat, X_cat):
            if np.array_equal(left, right):
                continue
            X[row, 0:2] = left
            X[row, 2] = right[0]
            y[row] = right[1]
            row += 1

        # Run solver
        solver = SimpleAnalogySolver(w=w, **solver_kwargs)
        y_pred = solver.predict(X)
        hits = float(np.sum(y_pred == y))
        correct[cat] = hits
        count[cat] = size
        accuracy[cat] = hits / size

    # Summary rows, inserted in the order wikipedia, all, wordnet.
    summary_groups = (
        ('wikipedia', [c for c in categories if c in data.wikipedia_categories]),
        ('all', list(categories)),
        ('wordnet', [c for c in categories if c in data.wordnet_categories]),
    )
    for label, members in summary_groups:
        correct[label] = sum(correct[c] for c in members)
    for label, members in summary_groups:
        count[label] = sum(count[c] for c in members)
    for label, _ in summary_groups:
        accuracy[label] = correct[label] / count[label]

    columns = [
        pd.Series(accuracy, name="accuracy"),
        pd.Series(correct, name="correct"),
        pd.Series(count, name="count"),
    ]
    return pd.concat(columns, axis=1)
def evaluate_similarity(w,
                        X,
                        y,
                        tokenize_oov_words_with_deepcut=False,
                        filter_not_found=False):
    """
    Correlate the model's cosine similarities with human similarity ratings.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    tokenize_oov_words_with_deepcut: bool, default: False
      If a word is not found in the embedding (OOV), tokenize it with
      deepcut and, when some of its tokens are in the embedding, store the
      mean of their vectors in `w` under the OOV word.

    filter_not_found: bool, default: False
      Remove a word pair if one of the words was not found in the embedding
      vocabulary. Otherwise missing words are replaced by the mean vector.

    Returns
    -------
    result: dict
      Keys 'spearmanr', 'pearsonr', 'num_oov_word_pairs', 'num_found_words',
      'num_missing_words', 'num_oov_created' and 'y.shape'.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words, found_words, oov_vecs_created = 0, 0, 0
    word_pair_oov_indices = []
    info_oov_words = {}
    info_created_words = {}

    words = w.vocabulary.word_id

    # Use deepcut to build vectors for OOV words from their sub-tokens.
    if tokenize_oov_words_with_deepcut:
        # a) collect the OOV words of the dataset
        oov_words = {
            query_word
            for query in X
            for query_word in query
            if query_word not in words
        }

        # b) try to derive a vector for each of them
        for ds_word in oov_words:
            tokens = deepcut.tokenize(ds_word)
            in_voc_tokens = [tok for tok in tokens if tok in w]

            if in_voc_tokens:
                # Represent the OOV word by the average of its known parts.
                token_vecs = [w.get(t) for t in in_voc_tokens]
                w[ds_word] = np.mean(token_vecs, axis=0)
                oov_vecs_created += 1
                info_created_words[ds_word] = in_voc_tokens
            else:
                info_oov_words[ds_word] = tokens

        print('All OOV words after deepcut:')
        pprint(info_oov_words)
        print('All "created"/replaced words by deepcut:')
        pprint(info_created_words)

    # Record the row index of every pair that contains an OOV word.
    # NOTE(review): membership is tested against `words`, captured before the
    # deepcut step; whether it sees vectors added via w[...] depends on
    # Embedding - confirm.
    for index, query in enumerate(X):
        for query_word in query:
            if query_word not in words:
                print("Missing Word:", query_word)
                missing_words += 1
                word_pair_oov_indices.append(index)
            else:
                print("Found Word:", query_word)
                found_words += 1

    # A pair with two missing words was recorded twice; keep each row once.
    word_pair_oov_indices = list(set(word_pair_oov_indices))
    print('word_pair_oov_indices', word_pair_oov_indices)

    if missing_words > 0 or oov_vecs_created > 0:
        logger.warning(
            "Missing {} words. Will replace them with mean vector".format(
                missing_words))
        logger.warning(
            "OOV words {} created from their subwords. Will replace them with mean vector of sub-tokens"
            .format(oov_vecs_created))
        logger.warning("Found {} words.".format(found_words))

    print('X.shape', X.shape)
    print('y.shape', y.shape)

    if filter_not_found:
        # Drop the pairs (and their ratings) that contain an OOV word.
        pairs = np.delete(X, word_pair_oov_indices, 0)
        y = np.delete(y, word_pair_oov_indices)
        print('new_X.shape', pairs.shape)
        print('new_y.shape', y.shape)
    else:
        pairs = X

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    # np.vstack needs a real sequence: generators are rejected by current NumPy.
    A = np.vstack([w.get(word, mean_vector) for word in pairs[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in pairs[:, 1]])
    if filter_not_found:
        print(len(A), len(B))
        print(type(A), type(B))

    # Cosine similarity of each pair.
    scores = np.array([
        v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        for v1, v2 in zip(A, B)
    ])

    # The original version only returned Spearman; Pearson and the OOV
    # bookkeeping are reported as well.
    result = {
        'spearmanr': scipy.stats.spearmanr(scores, y).correlation,
        'pearsonr': scipy.stats.pearsonr(scores, y)[0],
        'num_oov_word_pairs': len(word_pair_oov_indices),
        'num_found_words': found_words,
        'num_missing_words': missing_words,
        'num_oov_created': oov_vecs_created,
        'y.shape': y.shape
    }

    return result
def evaluate_on_all(w, word_embedding_name):
    """
    Evaluate Embedding on the similarity and analogy benchmarks

    Categorization benchmarks are not run by this variant.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate. A dict is converted with Embedding.from_dict.

    word_embedding_name:
      Value stored in the "word_embedding" column of the result, so that
      rows produced for different embeddings can be told apart.

    Returns
    -------
    results: pandas.DataFrame
      DataFrame with results, one per column, preceded by the
      "word_embedding" column.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Calculate results on similarity
    logger.info("Calculating similarity benchmarks")
    similarity_tasks = {
        "MTurk": fetch_MTurk(),
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "Rubenstein_and_Goodenough": fetch_RG65(),
        "Rare_Words": fetch_RW(),
        "SimLex999": fetch_SimLex999(),
        "TR9856": fetch_TR9856(),
    }

    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        logger.info("Spearman correlation of scores on {} {}".format(
            name, similarity_results[name]))

    # Calculate results on analogy
    logger.info("Calculating analogy benchmarks")
    analogy_tasks = {
        "Google_analogy": fetch_google_analogy(),
        "MSR": fetch_msr_analogy(),
    }

    analogy_results = {}
    for name, data in iteritems(analogy_tasks):
        analogy_results[name] = evaluate_analogy(w, data.X, data.y)
        logger.info("Analogy prediction accuracy on {} {}".format(
            name, analogy_results[name]))

    wordrep = evaluate_on_WordRep(w)
    if isinstance(wordrep, pd.DataFrame):
        # evaluate_on_WordRep builds a frame with the columns
        # accuracy/correct/count and 'all' as a row label, so plain ['all']
        # would be a failing column lookup; pick the accuracy of that row.
        analogy_results["MSR_WordRep"] = wordrep.loc['all', 'accuracy']
    else:
        # Fallback for a WordRep evaluator that returns a plain mapping.
        analogy_results["MSR_WordRep"] = wordrep['all']
    logger.info("Analogy prediction accuracy on {} {}".format(
        "MSR_WordRep", analogy_results["MSR_WordRep"]))

    analogy_results["SemEval2012_2"] = evaluate_on_semeval_2012_2(w)['all']
    logger.info("Analogy prediction accuracy on {} {}".format(
        "SemEval_2012_Task_2", analogy_results["SemEval2012_2"]))

    # Construct pd table: embedding name first, then the benchmark columns.
    analogy = pd.DataFrame([analogy_results])
    sim = pd.DataFrame([similarity_results])
    w_name = pd.DataFrame([{"word_embedding": word_embedding_name}])
    results = w_name.join(sim).join(analogy)

    return results
def evaluate_on_all(w):
    """
    Run the similarity, analogy and categorization benchmarks on an embedding.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate. A dict is converted with Embedding.from_dict.

    Returns
    -------
    results: pandas.DataFrame
      DataFrame with results, one benchmark per column.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    def _score_all(datasets, scorer, log_template):
        # Evaluate the datasets one by one, logging each score as it arrives.
        collected = {}
        for label, bunch in iteritems(datasets):
            collected[label] = scorer(w, bunch.X, bunch.y)
            logger.info(log_template.format(label, collected[label]))
        return collected

    # Similarity
    logger.info("Calculating similarity benchmarks")
    similarity_results = _score_all(
        {
            "MEN": fetch_MEN(),
            "WS353": fetch_WS353(),
            "WS353R": fetch_WS353(which="relatedness"),
            "WS353S": fetch_WS353(which="similarity"),
            "SimLex999": fetch_SimLex999(),
            "RW": fetch_RW(),
            "RG65": fetch_RG65(),
            "MTurk": fetch_MTurk(),
        },
        evaluate_similarity,
        "Spearman correlation of scores on {} {}",
    )

    # Analogy
    logger.info("Calculating analogy benchmarks")
    analogy_results = _score_all(
        {
            "Google": fetch_google_analogy(),
            "MSR": fetch_msr_analogy(),
        },
        evaluate_analogy,
        "Analogy prediction accuracy on {} {}",
    )
    # SemEval is evaluated separately; only its 'all' entry is kept.
    semeval_all = evaluate_on_semeval_2012_2(w)['all']
    analogy_results["SemEval2012_2"] = semeval_all
    logger.info("Analogy prediction accuracy on {} {}".format(
        "SemEval2012", semeval_all))

    # Categorization
    logger.info("Calculating categorization benchmarks")
    categorization_results = _score_all(
        {
            "AP": fetch_AP(),
            "BLESS": fetch_BLESS(),
            "Battig": fetch_battig(),
            "ESSLI_2c": fetch_ESSLI_2c(),
            "ESSLI_2b": fetch_ESSLI_2b(),
            "ESSLI_1a": fetch_ESSLI_1a(),
        },
        evaluate_categorization,
        "Cluster purity on {} {}",
    )

    # Join the three single-row tables: categorization, similarity, analogy.
    cat_table = pd.DataFrame([categorization_results])
    analogy_table = pd.DataFrame([analogy_results])
    sim_table = pd.DataFrame([similarity_results])
    return cat_table.join(sim_table).join(analogy_table)
def evaluate_categorization(w, X, y, method="all", seed=None):
    """
    Evaluate embeddings on categorization task.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to test. A dict is converted with Embedding.from_dict.

    X: vector, shape: (n_samples, )
      Vector of words.

    y: vector, shape: (n_samples, )
      Vector of cluster assignments.

    method: string, default: "all"
      What method to use. Possible values are "agglomerative", "kmeans", "all".
      If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
      hyperparameter tuning to avoid overfitting).
      If "kmeans" is passed, method will fit KMeans.
      In both cases number of clusters is preset to the correct value.

    seed: int, default: None
      Seed used to shuffle the samples and passed to KMeans.

    Returns
    -------
    purity: float
      Purity of the best obtained clustering.

    Notes
    -----
    KMedoids method was excluded as empirically didn't improve over KMeans (for categorization
    tasks available in the package).
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert method in ["all", "kmeans", "agglomerative"], "Uncrecognized method"

    # Words missing from the embedding fall back to the mean vector.
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    flat_words = X.flatten()
    # Reset before the lookups below.
    # NOTE(review): presumably w.get increments w.oov on misses - confirm.
    w.oov = 0
    # np.vstack needs a real sequence: generators are rejected by current NumPy.
    words = np.vstack([w.get(word, mean_vector) for word in flat_words])
    print('{} oov words out of {}'.format(w.oov, len(flat_words)))
    ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False)

    # Shuffled views and cluster count are shared by every method below.
    shuffled_words = words[ids]
    shuffled_labels = y[ids]
    n_clusters = len(set(y))

    # Evaluate clustering on several hyperparameters of AgglomerativeClustering and
    # KMeans
    best_purity = 0

    if method in ("all", "agglomerative"):
        best_purity = calculate_purity(
            shuffled_labels,
            AgglomerativeClustering(n_clusters=n_clusters,
                                    affinity="euclidean",
                                    linkage="ward").fit_predict(shuffled_words))
        logger.debug("Purity={:.3f} using affinity={} linkage={}".format(
            best_purity, 'euclidean', 'ward'))

        for affinity in ["cosine", "euclidean"]:
            for linkage in ["average", "complete"]:
                purity = calculate_purity(
                    shuffled_labels,
                    AgglomerativeClustering(n_clusters=n_clusters,
                                            affinity=affinity,
                                            linkage=linkage).fit_predict(shuffled_words))
                logger.debug(
                    "Purity={:.3f} using affinity={} linkage={}".format(
                        purity, affinity, linkage))
                best_purity = max(best_purity, purity)

    if method in ("all", "kmeans"):
        purity = calculate_purity(
            shuffled_labels,
            KMeans(random_state=seed, n_init=10,
                   n_clusters=n_clusters).fit_predict(shuffled_words))
        logger.debug("Purity={:.3f} using KMeans".format(purity))
        best_purity = max(purity, best_purity)

    return best_purity
def evaluate_categorization(w, X, y, method="all", seed=None):
    """
    Evaluate embeddings on categorization task.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to test. A dict is converted with Embedding.from_dict.

    X: vector, shape: (n_samples, )
      Vector of words.

    y: vector, shape: (n_samples, )
      Vector of cluster assignments.

    method: string, default: "all"
      What method to use. Possible values are "agglomerative", "kmeans", "all".
      If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
      hyperparameter tuning to avoid overfitting).
      If "kmeans" is passed, method will fit KMeans.
      In both cases number of clusters is preset to the correct value.

    seed: int, default: None
      Seed used to shuffle the samples and passed to KMeans.

    Returns
    -------
    purity: float
      Purity of the best obtained clustering.

    Notes
    -----
    KMedoids method was excluded as empirically didn't improve over KMeans (for categorization
    tasks available in the package).
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert method in ["all", "kmeans", "agglomerative"], "Uncrecognized method"

    run_agglomerative = method == "all" or method == "agglomerative"
    run_kmeans = method == "all" or method == "kmeans"

    # Unknown words are represented by the mean of all embedding vectors.
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    all_words = X.flatten()
    # Reset before the lookups below.
    # NOTE(review): presumably w.get increments w.oov on misses - confirm.
    w.oov = 0
    # A list is required here: np.vstack rejects generators on current NumPy.
    words = np.vstack([w.get(word, mean_vector) for word in all_words])
    print('{} oov words out of {}'.format(w.oov, len(all_words)))
    ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False)

    # Computed once and reused by every clustering run.
    X_shuffled = words[ids]
    y_shuffled = y[ids]
    k = len(set(y))

    # Evaluate clustering on several hyperparameters of AgglomerativeClustering and
    # KMeans
    best_purity = 0

    if run_agglomerative:
        ward = AgglomerativeClustering(n_clusters=k,
                                       affinity="euclidean",
                                       linkage="ward")
        best_purity = calculate_purity(y_shuffled, ward.fit_predict(X_shuffled))
        logger.debug("Purity={:.3f} using affinity={} linkage={}".format(
            best_purity, 'euclidean', 'ward'))

        for affinity in ["cosine", "euclidean"]:
            for linkage in ["average", "complete"]:
                model = AgglomerativeClustering(n_clusters=k,
                                                affinity=affinity,
                                                linkage=linkage)
                purity = calculate_purity(y_shuffled,
                                          model.fit_predict(X_shuffled))
                logger.debug("Purity={:.3f} using affinity={} linkage={}".format(
                    purity, affinity, linkage))
                best_purity = max(best_purity, purity)

    if run_kmeans:
        kmeans = KMeans(random_state=seed, n_init=10, n_clusters=k)
        purity = calculate_purity(y_shuffled, kmeans.fit_predict(X_shuffled))
        logger.debug("Purity={:.3f} using KMeans".format(purity))
        best_purity = max(purity, best_purity)

    return best_purity
def evaluate_on_all(w, entity_benchmark=False, fastText_ML=False):
    """
    Evaluate Embedding on all fast-running benchmarks

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate. A dict is converted with Embedding.from_dict.

    entity_benchmark: bool, default: False
      If true, the results of evaluate_on_Kore are joined to the table.

    fastText_ML: bool, default: False
      If true, the analogy tasks "Fi", "Cs", "Zh", "De", "Es", "Pt" and "It"
      are evaluated in addition to "Google" and "MSR".

    Returns
    -------
    results: pandas.DataFrame
      DataFrame with results, one per column.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Calculate results on similarity
    logger.info("Calculating similarity benchmarks")
    similarity_tasks = {
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "WS353R": fetch_WS353(which="relatedness"),
        "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "RW": fetch_RW(),
        "RG65": fetch_RG65(),
        "MTurk": fetch_MTurk(),
    }

    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        logger.info("Spearman correlation of scores on {} {}".format(
            name, similarity_results[name]))

    # Calculate results on analogy
    if fastText_ML:
        logger.info(
            "Calculating analogy benchmarks with multilingual analogy tasks")
        analogy_tasks = {
            "Google": fetch_google_analogy(),
            "MSR": fetch_msr_analogy(),
            "Fi": fetch_finish_analogy(),
            "Cs": fetch_czech_analogy(),
            "Zh": fetch_chinese_analogy(),
            "De": fetch_german_analogy(),
            "Es": fetch_spanish_analogy(),
            "Pt": fetch_portuguese_analogy(),
            "It": fetch_italy_analogy(),
        }
    else:
        logger.info("Calculating analogy benchmarks")
        analogy_tasks = {
            "Google": fetch_google_analogy(),
            "MSR": fetch_msr_analogy(),
        }

    analogy_results = {}
    for name, data in iteritems(analogy_tasks):
        analogy_results[name] = evaluate_analogy(w, data.X, data.y)
        logger.info("Analogy prediction accuracy on {} {}".format(
            name, analogy_results[name]))

    analogy_results["SemEval2012_2"] = evaluate_on_semeval_2012_2(w)['all']
    logger.info("Analogy prediction accuracy on {} {}".format(
        "SemEval2012", analogy_results["SemEval2012_2"]))

    # Calculate results on categorization
    logger.info("Calculating categorization benchmarks")
    categorization_tasks = {
        "AP": fetch_AP(),
        "BLESS": fetch_BLESS(),
        "Battig": fetch_battig(),
        "ESSLI_2c": fetch_ESSLI_2c(),
        "ESSLI_2b": fetch_ESSLI_2b(),
        "ESSLI_1a": fetch_ESSLI_1a(),
    }

    categorization_results = {}
    for name, data in iteritems(categorization_tasks):
        categorization_results[name] = evaluate_categorization(
            w, data.X, data.y)
        logger.info("Cluster purity on {} {}".format(
            name, categorization_results[name]))

    # Construct pd table; each intermediate frame is echoed to stdout.
    cat = pd.DataFrame([categorization_results])
    print(cat)
    analogy = pd.DataFrame([analogy_results])
    print(analogy)
    sim = pd.DataFrame([similarity_results])
    print(sim)
    results = cat.join(sim).join(analogy)

    # Add Kore evaluation result when the entity benchmark is requested.
    if entity_benchmark:
        kore_results = evaluate_on_Kore(w)
        kore = pd.DataFrame([kore_results])
        results = results.join(kore)

    return results