# Imports assumed by this snippet (the source omits them)
from six import iteritems
from web.datasets.similarity import (fetch_WS353, fetch_RG65, fetch_SimLex999,
                                     fetch_MTurk, fetch_RW, fetch_MEN)
from web.embedding import Embedding
from web.evaluate import evaluate_similarity


def evaluate_simi(wv, w2i, vocab):
    # Build a word -> vector dict from the embedding matrix `wv` and the
    # word-to-index map `w2i`, then wrap it as a web Embedding object
    wv_dict = dict()
    for w in vocab:
        wv_dict[w] = wv[w2i[w], :]
    w = Embedding.from_dict(wv_dict)

    # Calculate results on similarity
    print("Calculating similarity benchmarks")
    similarity_tasks = {
        "WS353": fetch_WS353(),
        "RG65": fetch_RG65(),
        # "WS353R": fetch_WS353(which="relatedness"),
        # "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "MTurk": fetch_MTurk(),
        "RW": fetch_RW(),
        "MEN": fetch_MEN(),
    }

    # similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        print("Sample data from {}, num of samples: {} : pair \"{}\" and \"{}\" is assigned score {}"
              .format(name, len(data.X), data.X[0][0], data.X[0][1], data.y[0]))
        score = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, score))
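# A minimal usage sketch (assumed, not part of the source): `wv` is an
# (n_words x dim) matrix, `w2i` maps each word to its row in `wv`, and
# `vocab` lists the words to keep. The fetchers download each benchmark
# on first use; benchmark words missing from the embedding are imputed
# by the library with the mean vector
if __name__ == "__main__":
    import numpy as np
    vocab = ["cat", "dog", "car"]
    w2i = {word: i for i, word in enumerate(vocab)}
    wv = np.random.rand(len(vocab), 50)  # toy random vectors
    evaluate_simi(wv, w2i, vocab)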
# __init__ of SimilarityWordEmbeddingEval (see the super() call below);
# `logger` and `iteritems` are assumed to be module-level names
def __init__(self, embedder, prefix="", **kwargs):
    try:
        from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999, fetch_RW
    except ImportError:
        raise RuntimeError(
            "Please install web (https://github.com/kudkudak/word-embeddings-benchmarks)"
        )
    self._embedder = embedder
    self._prefix = prefix

    # Define tasks
    logger.info("Downloading benchmark data")
    tasks = {
        # TODO: Pick a bit better tasks
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "SIMLEX999": fetch_SimLex999(),
        "RW": fetch_RW()
    }

    # Print sample data
    for name, data in iteritems(tasks):
        logger.info(
            "Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}"
            .format(name, data.X[0][0], data.X[0][1], data.y[0]))

    logger.info("Checking embedder for " + prefix)
    logger.info(embedder(["love"])[0, 0:5])  # Test embedder

    self._tasks = tasks
    super(SimilarityWordEmbeddingEval, self).__init__(**kwargs)
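# The `embedder` argument is expected to be a callable mapping a list of
# words to a 2-D array of vectors (the check above slices row 0, columns
# 0:5 of its output). A hypothetical embedder built from a plain
# word -> vector dict, for illustration only:
import numpy as np

def make_dict_embedder(vectors, dim):
    """vectors: dict mapping word -> 1-D numpy array of length dim."""
    def embedder(words):
        # Unknown words fall back to a zero vector in this sketch
        return np.stack([vectors.get(w, np.zeros(dim)) for w in words])
    return embedder

# e.g. make_dict_embedder({"love": np.ones(5)}, 5)(["love"])[0, 0:5]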
# Imports assumed by this snippet (the source omits them)
import logging
from six import iteritems
from web.datasets.similarity import (fetch_WS353, fetch_RG65, fetch_RW,
                                     fetch_MTurk, fetch_MEN, fetch_SimLex999)
from web.embedding import Embedding
from web.vocabulary import Vocabulary
from web.evaluate import evaluate_similarity


def web_tests(emb):
    """
    :param emb: dict of words and their corresponding embeddings
    :return: dict of word-embeddings-benchmarks tests and scores received
    """
    similarity_tasks = {
        'WS353': fetch_WS353(),
        'RG65': fetch_RG65(),
        'RW': fetch_RW(),
        'MTurk': fetch_MTurk(),
        'MEN': fetch_MEN(),
        'SimLex999': fetch_SimLex999()
    }
    web_emb = Embedding(Vocabulary(list(emb.keys())), list(emb.values()))
    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        # Evaluate once and reuse the score, rather than recomputing it
        # for the log line as the original did
        similarity_results[name] = evaluate_similarity(web_emb, data.X, data.y)
        logging.info("Spearman correlation of scores on {} {}".format(
            name, similarity_results[name]))
    return similarity_results
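# Minimal usage sketch (assumed): run all six similarity benchmarks over a
# toy word -> vector dict; scores will be near zero for random vectors
if __name__ == "__main__":
    import numpy as np
    logging.basicConfig(level=logging.INFO)
    toy_emb = {w: np.random.rand(50) for w in ["king", "queen", "man", "woman"]}
    print(web_tests(toy_emb))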
# Imports assumed by this snippet (the source omits them)
from web.datasets.similarity import (fetch_WS353, fetch_MEN, fetch_SimLex999,
                                     fetch_MTurk, fetch_RG65, fetch_RW, fetch_TR9856)
from web.datasets.analogy import fetch_msr_analogy, fetch_google_analogy


def get_dataset(dataset_name):
    if dataset_name == "WS353":
        # Note: the original had a second, unreachable "WS353" branch
        # returning fetch_WS353('all'); only this branch ever ran
        return fetch_WS353("similarity")
    elif dataset_name == "MEN":
        return fetch_MEN("all")
    elif dataset_name == "SimLex-999":
        return fetch_SimLex999()
    elif dataset_name == "MTurk":
        return fetch_MTurk()
    elif dataset_name == "RG65":
        return fetch_RG65()
    elif dataset_name == "RW":
        return fetch_RW()
    elif dataset_name == "TR9856":
        return fetch_TR9856()
    elif dataset_name == "MSR":
        return fetch_msr_analogy()
    elif dataset_name == "Google":
        return fetch_google_analogy()
    else:
        raise Exception("{}: dataset not supported".format(dataset_name))
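# Minimal usage sketch (assumed): every returned bunch exposes `.X`
# (word pairs, or analogy questions) and `.y` (gold scores, or answers)
if __name__ == "__main__":
    data = get_dataset("RG65")
    print(len(data.X), data.X[0], data.y[0])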
# Imports assumed by this snippet (the source omits them)
import pandas as pd
from six import iteritems
from web.datasets.similarity import (fetch_MTurk, fetch_MEN, fetch_WS353,
                                     fetch_RG65, fetch_RW, fetch_SimLex999, fetch_TR9856)
from web.datasets.analogy import fetch_google_analogy, fetch_msr_analogy
from web.evaluate import evaluate_similarity, evaluate_analogy


def evaluateOnAll(w):
    similarity_tasks = {
        "MTurk": fetch_MTurk(),
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "RubensteinAndGoodenough": fetch_RG65(),
        "Rare Words": fetch_RW(),
        "SIMLEX999": fetch_SimLex999(),
        "TR9856": fetch_TR9856()
    }
    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, similarity_results[name]))

    # Calculate results on analogy
    print("Calculating analogy benchmarks")
    analogy_tasks = {
        "Google": fetch_google_analogy(),
        "MSR": fetch_msr_analogy()
    }
    analogy_results = {}
    for name, data in iteritems(analogy_tasks):
        analogy_results[name] = evaluate_analogy(w, data.X, data.y)
        print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))

    # calAnswersonSemEval is an external helper defined elsewhere in the source
    analogy_results["SemEval2012_2"] = calAnswersonSemEval(w)['all']
    print("Analogy prediction accuracy on {} {}".format("SemEval2012", analogy_results["SemEval2012_2"]))

    # Merge similarity and analogy scores into a single one-row DataFrame
    analogy = pd.DataFrame([analogy_results])
    sim = pd.DataFrame([similarity_results])
    results = sim.join(analogy)
    return results
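# Minimal usage sketch (assumed): `calAnswersonSemEval` must be defined
# before calling; a plausible stand-in is the library's own SemEval 2012.2
# evaluator, whose result is likewise indexable by 'all'. Downloads are
# large, so the first run is slow
if __name__ == "__main__":
    from web.embeddings import fetch_GloVe
    from web.evaluate import evaluate_on_semeval_2012_2
    calAnswersonSemEval = evaluate_on_semeval_2012_2  # stand-in, see note above
    w = fetch_GloVe(corpus="wiki-6B", dim=50)
    print(evaluateOnAll(w))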
# Import assumed by this test (the source omits it)
from web.datasets.similarity import fetch_RW


def test_RW_fetcher():
    # Rare Words (RW) has 2034 pairs scored on a 0-10 scale
    data = fetch_RW()
    assert len(data.y) == len(data.X) == 2034
    assert 10.0 >= data.y.max() >= 9.8
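# The test is written for pytest-style collection, but it can also be run
# directly; the fetcher downloads the dataset on first call
if __name__ == "__main__":
    test_RW_fetcher()
    print("RW fetcher test passed")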
# Imports assumed by this snippet (the source omits them)
import numpy as np
from six import iteritems
from web.datasets.similarity import (fetch_MTurk, fetch_MEN, fetch_WS353,
                                     fetch_RG65, fetch_RW, fetch_SimLex999, fetch_TR9856)
from web.embeddings import (fetch_GloVe, fetch_PDC, fetch_HDC, fetch_LexVec,
                            fetch_conceptnet_numberbatch, fetch_SG_GoogleNews,
                            fetch_FastText, load_embedding)
from web.evaluate import evaluate_similarity

#w_glove = fetch_GloVe(corpus="wiki-6B", dim=300)
#w_PDC = fetch_PDC()
#w_HDC = fetch_HDC()
#w_LexVec = fetch_LexVec(which="wikipedia+newscrawl-W")
#w_conceptnet_numberbatch = fetch_conceptnet_numberbatch()
#w_w2v = fetch_SG_GoogleNews()
#w_fastText = fetch_FastText()
#load_embedding(path, format='word2vec', normalize=True, lower=False, clean_words=False)

# Define tasks
tasks = {
    "MTurk": fetch_MTurk(),
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "RG65": fetch_RG65(),
    "RW": fetch_RW(),
    "SIMLEX999": fetch_SimLex999(),
    "TR9856": fetch_TR9856()
}

# One row per model, one column per task
result = np.zeros((7, 7))

# Print sample data
#for name, data in iteritems(tasks):
#    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(name, data.X[0][0], data.X[0][1], data.y[0]))

# Calculate results using helper function; `models` (a model-name ->
# fetch-expression mapping) is assumed to be defined earlier in the source
i = 0
for m_name, m_fun in iteritems(models):
    j = 0
    try:
        model = eval(m_fun)
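    except Exception as e:
        # Assumed completion: the source is truncated after eval(m_fun), so
        # this except clause and the evaluation body below are a sketch.
        # Skip models that fail to load, leaving their row of `result` at zero
        print("Failed to load {}: {}".format(m_name, e))
        i += 1
        continue
    # Evaluate the loaded model on every task, filling result[i][j]
    for t_name, data in iteritems(tasks):
        result[i][j] = evaluate_similarity(model, data.X, data.y)
        j += 1
    i += 1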
print ("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(w_data, data.X, data.y))) """### Similarity Tasks""" import logging from six import iteritems from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999, fetch_MTurk, fetch_RG65, fetch_RW, fetch_TR9856 from web.embeddings import fetch_GloVe, fetch_static_dataset, fetch_PDC, fetch_LexVec, fetch_HDC, fetch_conceptnet_numberbatch, fetch_FastText from web.evaluate import evaluate_similarity tasks = { "MTurk": fetch_MTurk(), "MEN": fetch_MEN(), "WS353": fetch_WS353(), "RubensteinAndGoodenough": fetch_RG65(), "Rare Words": fetch_RW(), "SIMLEX999": fetch_SimLex999(), "TR9856": fetch_TR9856() } """##### Fetch all word embedding models""" wordEmbeddingPath = '/content/drive/My Drive/Term5/NLP/Dataset/WebEmbeddings' #WE:1 GloVe from GitHub code on wiki corpus #Corpus:Wiki #Vocabulary Size:# #Dimension: 300 w_gv_1 = fetch_GloVe(corpus="wiki-6B", dim=300) #WE:2 Analysis on Gensim continuous Skipgram