Example #1
from six import iteritems
from web.datasets.similarity import (fetch_MEN, fetch_MTurk, fetch_RG65,
                                     fetch_RW, fetch_SimLex999, fetch_WS353)
from web.embedding import Embedding
from web.evaluate import evaluate_similarity


def evaluate_simi(wv, w2i, vocab):
    """Evaluate an embedding matrix on standard word-similarity benchmarks.

    wv: array of shape (len(vocab), dim), one embedding row per word
    w2i: dict mapping each word to its row index in wv
    vocab: iterable of words to evaluate
    """
    # Re-pack the matrix as a word -> vector dict and wrap it in the
    # web library's Embedding object.
    wv_dict = {w: wv[w2i[w], :] for w in vocab}
    w = Embedding.from_dict(wv_dict)

    # Calculate results on similarity
    print("Calculating similarity benchmarks")
    similarity_tasks = {
        "WS353": fetch_WS353(),
        "RG65": fetch_RG65(),
        # "WS353R": fetch_WS353(which="relatedness"),
        # "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "MTurk": fetch_MTurk(),
        "RW": fetch_RW(),
        "MEN": fetch_MEN(),
    }

    for name, data in iteritems(similarity_tasks):
        print(
            "Sample data from {}, num of samples: {} : pair \"{}\" and \"{}\" is assigned score {}"
            .format(name, len(data.X), data.X[0][0], data.X[0][1], data.y[0]))
        score = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, score))
Example #2
def web_tests(emb):
    """
    :param emb: dict of words and their corresponding embeddings
    :return: dict of word-embeddings-benchmarks tests and scores received
    """
    similarity_tasks = {
        'WS353': fetch_WS353(),
        'RG65': fetch_RG65(),
        'RW': fetch_RW(),
        'MTurk': fetch_MTurk(),
        'MEN': fetch_MEN(),
        'SimLex999': fetch_SimLex999()
    }

    web_emb = Embedding(Vocabulary(list(emb.keys())), list(emb.values()))
    similarity_results = {}
    for name, data in iteritems(similarity_tasks):
        # Evaluate once and reuse the score instead of re-running the
        # whole benchmark a second time just for logging.
        similarity_results[name] = evaluate_similarity(web_emb, data.X, data.y)
        logging.info("Spearman correlation of scores on {} {}".format(
            name, similarity_results[name]))
    return similarity_results
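
A sketch of calling web_tests on a toy embedding dict (hypothetical data;
assumes numpy is available and logging is configured so the logging.info
lines are visible):

import logging
import numpy as np

logging.basicConfig(level=logging.INFO)

emb = {w: np.random.rand(50) for w in ["cat", "dog", "car"]}  # toy vectors
scores = web_tests(emb)  # dict of benchmark name -> Spearman score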
Example #3
def get_dataset(dataset_name):
    if dataset_name == "WS353":
        return fetch_WS353("similarity")
    elif dataset_name == "MEN":
        return fetch_MEN("all")
    elif dataset_name == "SimLex-999":
        return fetch_SimLex999()
    elif dataset_name == "MTurk":
        return fetch_MTurk()
    elif dataset_name == "RG65":
        return fetch_RG65()
    elif dataset_name == "RW":
        return fetch_RW()
    elif dataset_name == "TR9856":
        return fetch_TR9856()
    elif dataset_name == "MSR":
        return fetch_msr_analogy()
    elif dataset_name == "Google":
        return fetch_google_analogy()
    else:
        raise ValueError("{}: dataset not supported".format(dataset_name))
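
Each fetcher above returns a scikit-learn-style Bunch whose X field holds
the word pairs (or analogy questions) and whose y field holds the gold
scores, so a quick inspection sketch looks like:

data = get_dataset("RG65")
print(len(data.X))   # 65 word pairs
print(data.X[0])     # first pair of words
print(data.y[0])     # human similarity judgment for that pair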
Example #4
import pandas as pd
from six import iteritems
from web.datasets.analogy import fetch_google_analogy, fetch_msr_analogy
from web.datasets.similarity import (fetch_MEN, fetch_MTurk, fetch_RG65,
                                     fetch_RW, fetch_SimLex999, fetch_TR9856,
                                     fetch_WS353)
from web.evaluate import evaluate_analogy, evaluate_similarity


def evaluateOnAll(w):
    similarity_tasks = {
        "MTurk": fetch_MTurk(),
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "RubensteinAndGoodenough": fetch_RG65(),
        "Rare Words": fetch_RW(),
        "SIMLEX999": fetch_SimLex999(),
        "TR9856": fetch_TR9856()
    }

    similarity_results = {}

    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, similarity_results[name]))

    # Calculate results on analogy
    print("Calculating analogy benchmarks")
    analogy_tasks = {
        "Google": fetch_google_analogy(),
        "MSR": fetch_msr_analogy()
    }
    analogy_results = {}
    for name, data in iteritems(analogy_tasks):
        analogy_results[name] = evaluate_analogy(w, data.X, data.y)
        print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))

    # calAnswersonSemEval is a project-specific helper (defined elsewhere)
    # that scores the SemEval-2012 Task 2 benchmark.
    analogy_results["SemEval2012_2"] = calAnswersonSemEval(w)['all']
    print("Analogy prediction accuracy on {} {}".format("SemEval2012_2", analogy_results["SemEval2012_2"]))

    # Collect everything into a single one-row DataFrame.
    analogy = pd.DataFrame([analogy_results])
    sim = pd.DataFrame([similarity_results])
    results = sim.join(analogy)

    return results
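
A sketch of driving evaluateOnAll end to end (assumes the imports above,
that the project-specific calAnswersonSemEval is defined, and that the
large GloVe download succeeds):

from web.embeddings import fetch_GloVe

w = fetch_GloVe(corpus="wiki-6B", dim=300)  # pretrained 300-dim vectors
results = evaluateOnAll(w)                  # one-row DataFrame of scores
print(results.T)                            # benchmarks as rows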
Example #5
            res = results[id] 
        else:
            results[id] = {}
            res = results[id] 


        #w_embedding = fetch_Mine(id, format="dict", normalize=False, lower=False, clean_words=False)
        w_embedding = fetch_Mine(id, format="csr", normalize=False, lower=False, clean_words=False)
        #w_embedding = fetch_GloVe(corpus="wiki-6B", dim=300)

        
        print(' '.join('-' * 30))  # visual separator line
        # SIMILARITY
        similarity_results = {}
        similarity_tasks = {
            "RG65": fetch_RG65(),
            #"MEN": fetch_MEN(),    
            #"WS353": fetch_WS353(),
            #"WS353R": fetch_WS353(which="relatedness"),
            #"WS353S": fetch_WS353(which="similarity"),
            #"SimLex999": fetch_SimLex999(),
            #"MTurk": fetch_MTurk(),

            #"multilingual_SimLex999": fetch_multilingual_SimLex999(),
            #"RW": fetch_RW(),
        }
        
        for name, data in similarity_tasks.items():
            similarity_results[name] = evaluate_similarity(w_embedding, data.X, data.y)
            print("Spearman correlation of scores on {} {}".format(name, similarity_results[name]))
            res[name] = similarity_results[name]

Example #6
def test_RG65_fetcher():
    data = fetch_RG65()
    assert (len(data.y) == len(data.X) == 65)
    assert (10.0 >= data.y.max() >= 9.8)
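
For reference, the number that evaluate_similarity reports is the Spearman
rank correlation between the model's cosine similarities and the human
judgments; conceptually it reduces to a sketch like this (toy numbers):

import numpy as np
from scipy.stats import spearmanr

human = np.array([9.8, 7.3, 3.1, 0.4])      # gold scores for four pairs
model = np.array([0.91, 0.62, 0.40, 0.05])  # model cosine similarities
print(spearmanr(model, human).correlation)  # 1.0 here: identical ranking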
Example #7
# Evaluation loop from the notebook; assumes `tasks` (defined below) and an
# embedding `w_data` under test are in scope when it runs.
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(w_data, data.X, data.y)))

"""### Similarity Tasks"""

import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999, fetch_MTurk, fetch_RG65, fetch_RW, fetch_TR9856
from web.embeddings import fetch_GloVe, fetch_static_dataset, fetch_PDC, fetch_LexVec, fetch_HDC, fetch_conceptnet_numberbatch, fetch_FastText
from web.evaluate import evaluate_similarity

tasks = {
    "MTurk": fetch_MTurk(),
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "RubensteinAndGoodenough": fetch_RG65(),
    "Rare Words": fetch_RW(),
    "SIMLEX999": fetch_SimLex999(),
    "TR9856": fetch_TR9856()
}

"""##### Fetch all word embedding models"""

wordEmbeddingPath = '/content/drive/My Drive/Term5/NLP/Dataset/WebEmbeddings'

# WE:1 GloVe from GitHub code on wiki corpus
# Corpus: Wiki
# Vocabulary Size: #
# Dimension: 300
w_gv_1 = fetch_GloVe(corpus="wiki-6B", dim=300)
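
A minimal sketch of scoring the fetched vectors against the tasks dict
defined above, mirroring the evaluation loop at the top of this example
with w_gv_1 as the embedding under test:

for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(
        name, evaluate_similarity(w_gv_1, data.X, data.y)))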