Пример #1
0
def evaluate_simi(wv, w2i, vocab):
    """Score an embedding matrix on the word-similarity benchmarks.

    Every word in ``vocab`` is looked up as row ``w2i[word]`` of ``wv``; the
    resulting word -> vector mapping is wrapped with ``Embedding.from_dict``
    and passed to ``evaluate_similarity`` for each benchmark listed below.
    One sample pair and the resulting score are printed per benchmark.

    :param wv: embedding matrix, indexed here as ``wv[row, :]``
    :param w2i: mapping from a word to its row index in ``wv``
    :param vocab: iterable of words to include in the evaluation
    :return: dict mapping benchmark name to the value returned by
        ``evaluate_similarity`` for that benchmark
    """
    embedding = Embedding.from_dict(
        {word: wv[w2i[word], :] for word in vocab})

    # Calculate results on similarity
    print("Calculating similarity benchmarks")
    # fetch_WS353 can also be called with which="relatedness" or
    # which="similarity" if the WS353 subsets are wanted as separate tasks.
    similarity_tasks = {
        "WS353": fetch_WS353(),
        "RG65": fetch_RG65(),
        "SimLex999": fetch_SimLex999(),
        "MTurk": fetch_MTurk(),
        "RW": fetch_RW(),
        "MEN": fetch_MEN(),
    }

    similarity_results = {}
    for name, data in similarity_tasks.items():
        print(
            "Sample data from {}, num of samples: {} : pair \"{}\" and \"{}\" is assigned score {}"
            .format(name, len(data.X), data.X[0][0], data.X[0][1], data.y[0]))
        score = evaluate_similarity(embedding, data.X, data.y)
        # Keep the score so callers can use it, not only read it from stdout.
        similarity_results[name] = score
        print("Spearman correlation of scores on {} {}".format(name, score))

    return similarity_results
Пример #2
0
    def __init__(self, embedder, prefix="", **kwargs):
        """Prepare the similarity benchmarks used to evaluate ``embedder``.

        Imports the benchmark fetchers from the ``web`` package, fetches the
        MEN, WS353, SimLex999 and RW datasets, logs one sample pair from each
        of them, and logs a slice of the embedder output for the word "love"
        as a quick check that the embedder can be called.

        :param embedder: callable that takes a list of words; its result is
            indexed as ``[0, 0:5]`` here
        :param prefix: label stored on the instance and appended to a log line
        :param kwargs: forwarded unchanged to the parent ``__init__``
        :raises RuntimeError: if the ``web`` package cannot be imported
        """
        try:
            from web.datasets.similarity import (
                fetch_MEN, fetch_WS353, fetch_SimLex999, fetch_RW)
        except ImportError:
            raise RuntimeError(
                "Please install web (https://github.com/kudkudak/word-embeddings-benchmarks)"
            )

        self._embedder = embedder
        self._prefix = prefix

        # Fetch the benchmark datasets one by one.
        # TODO: Pick a bit better tasks
        logger.info("Downloading benchmark data")
        benchmarks = {}
        benchmarks["MEN"] = fetch_MEN()
        benchmarks["WS353"] = fetch_WS353()
        benchmarks["SIMLEX999"] = fetch_SimLex999()
        benchmarks["RW"] = fetch_RW()

        # Log the first pair and its score from every benchmark.
        sample_template = (
            "Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}")
        for task_name in benchmarks:
            dataset = benchmarks[task_name]
            first_pair = dataset.X[0]
            logger.info(sample_template.format(
                task_name, first_pair[0], first_pair[1], dataset.y[0]))

        # Call the embedder once so a broken embedder fails here.
        logger.info("Checking embedder for " + prefix)
        probe = embedder(["love"])
        logger.info(probe[0, 0:5])

        self._tasks = benchmarks

        super(SimilarityWordEmbeddingEval, self).__init__(**kwargs)
Пример #3
0
def web_tests(emb):
    """
    Run the word-embeddings-benchmarks similarity tests on ``emb``.

    The keys and values of ``emb`` are wrapped in an ``Embedding`` built from
    a ``Vocabulary``, and ``evaluate_similarity`` is called once per dataset.
    Each score is both logged and stored in the returned dict.

    :param emb: dict of words and their corresponding embeddings
    :return: dict of word-embeddings-benchmarks tests and scores received
    """
    similarity_tasks = {
        'WS353': fetch_WS353(),
        'RG65': fetch_RG65(),
        'RW': fetch_RW(),
        'MTurk': fetch_MTurk(),
        'MEN': fetch_MEN(),
        'SimLex999': fetch_SimLex999()
    }

    web_emb = Embedding(Vocabulary(list(emb.keys())), list(emb.values()))
    similarity_results = {}
    for name, data in similarity_tasks.items():
        # Evaluate once and reuse the score for both the result and the log
        # line; the evaluation used to be repeated just to build the message.
        score = evaluate_similarity(web_emb, data.X, data.y)
        similarity_results[name] = score
        logging.info(
            "Spearman correlation of scores on {} {}".format(name, score))
    return similarity_results
Пример #4
0
def get_dataset(dataset_name):
    """Fetch the benchmark dataset registered under ``dataset_name``.

    Supported names: "WS353", "MEN", "SimLex-999", "MTurk", "RG65", "RW",
    "TR9856", "MSR" and "Google".

    "WS353" returns ``fetch_WS353("similarity")``. The earlier if/elif chain
    had a second "WS353" branch requesting ``'all'``, but it came after the
    first one and could never run, so it is not reproduced here.

    :param dataset_name: one of the supported names listed above
    :return: whatever the matching ``fetch_*`` function returns
    :raises ValueError: if ``dataset_name`` is not a supported name
    """
    # Values are zero-argument callables so that only the requested dataset
    # is fetched; nothing is downloaded while the table is being built.
    fetchers = {
        "WS353": lambda: fetch_WS353("similarity"),
        "MEN": lambda: fetch_MEN("all"),
        "SimLex-999": lambda: fetch_SimLex999(),
        "MTurk": lambda: fetch_MTurk(),
        "RG65": lambda: fetch_RG65(),
        "RW": lambda: fetch_RW(),
        "TR9856": lambda: fetch_TR9856(),
        "MSR": lambda: fetch_msr_analogy(),
        "Google": lambda: fetch_google_analogy(),
    }

    fetch = fetchers.get(dataset_name)
    if fetch is None:
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working.
        raise ValueError("{}: dataset not supported".format(dataset_name))
    return fetch()
def evaluateOnAll(w):
    """Evaluate ``w`` on the similarity and analogy benchmarks.

    Runs ``evaluate_similarity`` on seven similarity datasets and
    ``evaluate_analogy`` on the Google and MSR analogy datasets, then adds
    the ``'all'`` entry of ``calAnswersonSemEval(w)``. Every score is printed
    as soon as it is computed.

    :param w: embedding object handed to ``evaluate_similarity``,
        ``evaluate_analogy`` and ``calAnswersonSemEval``
    :return: ``pandas.DataFrame`` built by joining the one-row frame of
        similarity scores with the one-row frame of analogy scores
    """
    sim_datasets = {
        "MTurk": fetch_MTurk(),
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "RubensteinAndGoodenough": fetch_RG65(),
        "Rare Words": fetch_RW(),
        "SIMLEX999": fetch_SimLex999(),
        "TR9856": fetch_TR9856(),
    }

    sim_scores = {}
    for task in sim_datasets:
        dataset = sim_datasets[task]
        sim_scores[task] = evaluate_similarity(w, dataset.X, dataset.y)
        print("Spearman correlation of scores on {} {}".format(task, sim_scores[task]))

    # Calculate results on analogy
    print("Calculating analogy benchmarks")
    analogy_datasets = {
        "Google": fetch_google_analogy(),
        "MSR": fetch_msr_analogy(),
    }

    analogy_scores = {}
    for task in analogy_datasets:
        dataset = analogy_datasets[task]
        analogy_scores[task] = evaluate_analogy(w, dataset.X, dataset.y)
        print("Analogy prediction accuracy on {} {}".format(task, analogy_scores[task]))

    # The SemEval score is stored under "SemEval2012_2" but printed with the
    # shorter "SemEval2012" label.
    analogy_scores["SemEval2012_2"] = calAnswersonSemEval(w)['all']
    print("Analogy prediction accuracy on {} {}".format("SemEval2012", analogy_scores["SemEval2012_2"]))

    analogy_frame = pd.DataFrame([analogy_scores])
    sim_frame = pd.DataFrame([sim_scores])
    return sim_frame.join(analogy_frame)
def test_RW_fetcher():
    """Check the size and the maximum score of the ``fetch_RW`` dataset."""
    rw = fetch_RW()

    # Scores and word pairs must line up, with 2034 entries in total.
    n_scores = len(rw.y)
    n_pairs = len(rw.X)
    assert n_scores == n_pairs == 2034

    # The largest score must lie between 9.8 and 10.0 inclusive.
    top_score = rw.y.max()
    assert 9.8 <= top_score <= 10.0
Пример #7
0
#w_glove = fetch_GloVe(corpus="wiki-6B", dim=300)
#w_PDC = fetch_PDC()
#w_HDC = fetch_HDC()
#w_LexVec = fetch_LexVec(which="wikipedia+newscrawl-W")
#w_conceptnet_numberbatch = fetch_conceptnet_numberbatch()
#w_w2v = fetch_SG_GoogleNews()
#w_fastText = fetch_FastText()  #load_embedding(path, format='word2vec', normalize=True, lower=False, clean_words=False)


# Define tasks: benchmark name -> dataset object returned by the matching
# fetch_* call. Every fetch_* call runs right here, while the dict is built.
tasks = {
    "MTurk": fetch_MTurk(),
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "RG65":fetch_RG65(),
    "RW":fetch_RW(),
    "SIMLEX999": fetch_SimLex999(),
    "TR9856":fetch_TR9856()
}

# 7x7 matrix of zeros that will hold the scores.
# NOTE(review): presumably one axis is the 7 tasks above and the other the
# entries of `models` (not defined in this snippet) — confirm before resizing.
result =   np.zeros((7,7))
# Print sample data
#for name, data in iteritems(tasks):
#    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(name, data.X[0][0], data.X[0][1], data.y[0]))

# Calculate results using helper function
i = 0
for m_name, m_fun in iteritems(models):   
    j = 0
    try:
        model = eval(m_fun)
Пример #8
0
def test_RW_fetcher():
    """Check the size and the maximum score of the ``fetch_RW`` dataset."""
    data = fetch_RW()
    # Scores and word pairs must line up, with 2034 entries in total.
    assert (len(data.y) == len(data.X) == 2034)
    # The largest score must lie between 9.8 and 10.0 inclusive.
    assert (10.0 >= data.y.max() >= 9.8)
    # NOTE(review): `name` and `w_data` are not defined inside this function;
    # this line only works if both exist as module-level globals — confirm
    # whether it belongs here at all.
    print ("Spearman correlation of scores on {} {}".format(name, evaluate_similarity(w_data, data.X, data.y)))

"""### Similarity Tasks"""

import logging
from six import iteritems
from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999, fetch_MTurk, fetch_RG65, fetch_RW, fetch_TR9856
from web.embeddings import fetch_GloVe, fetch_static_dataset, fetch_PDC, fetch_LexVec, fetch_HDC, fetch_conceptnet_numberbatch, fetch_FastText
from web.evaluate import evaluate_similarity

# Benchmark name -> dataset object returned by the matching fetch_* call.
# Every fetch_* call runs right here, while the dict is built.
tasks = {
    "MTurk": fetch_MTurk(),
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "RubensteinAndGoodenough": fetch_RG65(),
    "Rare Words": fetch_RW(),
    "SIMLEX999": fetch_SimLex999(),
    "TR9856": fetch_TR9856()
}

"""##### Fetch all word embedding models"""

# Folder holding the word embedding files.
# NOTE(review): the path looks like a Google Drive mount in Colab — confirm
# it exists in the environment where this runs.
wordEmbeddingPath = '/content/drive/My Drive/Term5/NLP/Dataset/WebEmbeddings'

# Word embedding 1: GloVe (from the GitHub code), wiki corpus
# Corpus: Wiki (fetched as "wiki-6B")
# Vocabulary size: not filled in
# Dimension: 300
w_gv_1 = fetch_GloVe(corpus="wiki-6B", dim=300)

#WE:2 Analysis on Gensim continuous Skipgram