from six import iteritems
from web.embedding import Embedding
from web.datasets.similarity import (fetch_MEN, fetch_WS353, fetch_SimLex999,
                                     fetch_MTurk, fetch_RG65, fetch_RW)
from web.evaluate import evaluate_similarity


def evaluate_simi(wv, w2i, vocab):
    # Build a word -> vector dict from the embedding matrix and wrap it
    # in a web Embedding object.
    wv_dict = {w: wv[w2i[w], :] for w in vocab}
    w = Embedding.from_dict(wv_dict)

    # Calculate results on similarity
    print("Calculating similarity benchmarks")
    similarity_tasks = {
        "WS353": fetch_WS353(),
        "RG65": fetch_RG65(),
        # "WS353R": fetch_WS353(which="relatedness"),
        # "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "MTurk": fetch_MTurk(),
        "RW": fetch_RW(),
        "MEN": fetch_MEN(),
    }

    for name, data in iteritems(similarity_tasks):
        print("Sample data from {}, num of samples: {} : pair \"{}\" and \"{}\" is assigned score {}"
              .format(name, len(data.X), data.X[0][0], data.X[0][1], data.y[0]))
        score = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, score))
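# Usage sketch (hypothetical, not from the original source): evaluate_simi
# expects a 2-D word-vector matrix `wv`, a word -> row-index dict `w2i`, and
# a vocab list. The toy random vectors below only demonstrate the calling
# convention; the resulting correlations are meaningless for a vocabulary
# this small.
import numpy as np

vocab = ["cat", "dog", "car"]
w2i = {w: i for i, w in enumerate(vocab)}
wv = np.random.rand(len(vocab), 50)  # three random 50-dimensional vectors
evaluate_simi(wv, w2i, vocab)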
import logging

from six import iteritems
from web.embedding import Embedding
from web.vocabulary import Vocabulary
from web.datasets.similarity import (fetch_MEN, fetch_WS353, fetch_SimLex999,
                                     fetch_MTurk, fetch_RG65, fetch_RW)
from web.evaluate import evaluate_similarity


def web_tests(emb):
    """
    :param emb: dict of words and their corresponding embeddings
    :return: dict of word-embeddings-benchmarks tests and scores received
    """
    similarity_tasks = {
        'WS353': fetch_WS353(),
        'RG65': fetch_RG65(),
        'RW': fetch_RW(),
        'MTurk': fetch_MTurk(),
        'MEN': fetch_MEN(),
        'SimLex999': fetch_SimLex999()
    }
    web_emb = Embedding(Vocabulary(list(emb.keys())), list(emb.values()))
    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        # Evaluate each task once and reuse the stored score in the log.
        similarity_results[name] = evaluate_similarity(web_emb, data.X, data.y)
        logging.info("Spearman correlation of scores on {} {}".format(
            name, similarity_results[name]))
    return similarity_results
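# Usage sketch (hypothetical): web_tests takes a plain {word: vector} dict
# and returns a {task name: Spearman score} dict.
import numpy as np

emb = {w: np.random.rand(50) for w in ["cat", "dog", "car"]}
scores = web_tests(emb)  # e.g. {'WS353': ..., 'RG65': ..., ...}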
from web.datasets.similarity import (fetch_MEN, fetch_WS353, fetch_SimLex999,
                                     fetch_MTurk, fetch_RG65, fetch_RW,
                                     fetch_TR9856)
from web.datasets.analogy import fetch_msr_analogy, fetch_google_analogy


def get_dataset(dataset_name):
    if dataset_name == "WS353":
        return fetch_WS353("similarity")
    elif dataset_name == "MEN":
        return fetch_MEN("all")
    elif dataset_name == "SimLex-999":
        return fetch_SimLex999()
    elif dataset_name == "MTurk":
        return fetch_MTurk()
    elif dataset_name == "RG65":
        return fetch_RG65()
    elif dataset_name == "RW":
        return fetch_RW()
    elif dataset_name == "TR9856":
        return fetch_TR9856()
    elif dataset_name == "MSR":
        return fetch_msr_analogy()
    elif dataset_name == "Google":
        return fetch_google_analogy()
    else:
        raise ValueError("{}: dataset not supported".format(dataset_name))
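# Usage sketch (hypothetical): every fetcher returns a Bunch with word pairs
# in .X and human judgement scores in .y.
men = get_dataset("MEN")
print(len(men.X), men.X[0], men.y[0])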
import pandas as pd
from six import iteritems
from web.datasets.similarity import (fetch_MEN, fetch_WS353, fetch_SimLex999,
                                     fetch_MTurk, fetch_RG65, fetch_RW,
                                     fetch_TR9856)
from web.datasets.analogy import fetch_google_analogy, fetch_msr_analogy
from web.evaluate import evaluate_similarity, evaluate_analogy


def evaluateOnAll(w):
    similarity_tasks = {
        "MTurk": fetch_MTurk(),
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "RubensteinAndGoodenough": fetch_RG65(),
        "Rare Words": fetch_RW(),
        "SIMLEX999": fetch_SimLex999(),
        "TR9856": fetch_TR9856()
    }
    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, similarity_results[name]))

    # Calculate results on analogy
    print("Calculating analogy benchmarks")
    analogy_tasks = {
        "Google": fetch_google_analogy(),
        "MSR": fetch_msr_analogy()
    }
    analogy_results = {}
    for name, data in iteritems(analogy_tasks):
        analogy_results[name] = evaluate_analogy(w, data.X, data.y)
        print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))

    # calAnswersonSemEval is a project-specific helper, assumed to be defined
    # elsewhere in the source file, that scores the embedding on SemEval 2012
    # Task 2.
    analogy_results["SemEval2012_2"] = calAnswersonSemEval(w)['all']
    print("Analogy prediction accuracy on {} {}".format("SemEval2012", analogy_results["SemEval2012_2"]))

    # Merge both result sets into a single one-row DataFrame.
    analogy = pd.DataFrame([analogy_results])
    sim = pd.DataFrame([similarity_results])
    results = sim.join(analogy)
    return results
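# Usage sketch (hypothetical): run the full benchmark suite against a
# pretrained GloVe embedding fetched by the web package. Note that the
# downloads are large and the analogy benchmarks are slow.
from web.embeddings import fetch_GloVe

w_glove = fetch_GloVe(corpus="wiki-6B", dim=50)
print(evaluateOnAll(w_glove))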
# Reuse the cached per-id results entry if present, otherwise create it.
if id in results:
    res = results[id]
else:
    results[id] = {}
    res = results[id]

# fetch_Mine is a custom fetcher for locally trained embeddings.
#w_embedding = fetch_Mine(id, format="dict", normalize=False, lower=False, clean_words=False)
w_embedding = fetch_Mine(id, format="csr", normalize=False, lower=False, clean_words=False)
#w_embedding = fetch_GloVe(corpus="wiki-6B", dim=300)

print(' '.join('-' * 30))  # separator line

# SIMILARITY
similarity_results = {}
similarity_tasks = {
    "RG65": fetch_RG65(),
    #"MEN": fetch_MEN(),
    #"WS353": fetch_WS353(),
    #"WS353R": fetch_WS353(which="relatedness"),
    #"WS353S": fetch_WS353(which="similarity"),
    #"SimLex999": fetch_SimLex999(),
    #"MTurk": fetch_MTurk(),
    #"multilingual_SimLex999": fetch_multilingual_SimLex999(),
    #"RW": fetch_RW(),
}
for name, data in similarity_tasks.items():
    similarity_results[name] = evaluate_similarity(w_embedding, data.X, data.y)
    print("Spearman correlation of scores on {} {}".format(name, similarity_results[name]))
    res[name] = similarity_results[name]
from web.datasets.similarity import fetch_RG65


def test_RG65_fetcher():
    # RG65 has 65 scored word pairs; the top score should be close to 10
    # on this dataset's scale.
    data = fetch_RG65()
    assert len(data.y) == len(data.X) == 65
    assert 10.0 >= data.y.max() >= 9.8
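# The test above can be run with pytest, e.g.:
#   pytest -k test_RG65_fetcher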
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(
        name, evaluate_similarity(w_data, data.X, data.y)))

"""### Similarity Tasks"""

import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999, fetch_MTurk, fetch_RG65, fetch_RW, fetch_TR9856
from web.embeddings import fetch_GloVe, fetch_static_dataset, fetch_PDC, fetch_LexVec, fetch_HDC, fetch_conceptnet_numberbatch, fetch_FastText
from web.evaluate import evaluate_similarity

tasks = {
    "MTurk": fetch_MTurk(),
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "RubensteinAndGoodenough": fetch_RG65(),
    "Rare Words": fetch_RW(),
    "SIMLEX999": fetch_SimLex999(),
    "TR9856": fetch_TR9856()
}

"""##### Fetch all word embedding models"""

wordEmbeddingPath = '/content/drive/My Drive/Term5/NLP/Dataset/WebEmbeddings'

# WE:1 GloVe from GitHub code on wiki corpus
# Corpus: Wiki
# Vocabulary Size: #
# Dimension: 300
w_gv_1 = fetch_GloVe(corpus="wiki-6B", dim=300)
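# Evaluation sketch (hypothetical): once an embedding such as w_gv_1 has
# been fetched, it can be scored on every similarity task defined in `tasks`:
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(
        name, evaluate_similarity(w_gv_1, data.X, data.y)))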