def get_vector_pairs(w, X, y, dataset='simlex', save=True):

    if isinstance(w, dict):
        names = list(w.keys())
        embeddings = list(w.values())
        w = {'source': Embedding.from_dict(embeddings[0]),
             'target': Embedding.from_dict(embeddings[1])}

    missing_words = 0
    source_words = w['source'].vocabulary.word_id
    for query in X:
        for query_word in query:
            if query_word not in source_words:
                missing_words += 1
    if missing_words > 0:
        logger.warning("Missing {} source words. Will replace them with mean vector".format(missing_words))

    mean_vector_source = np.mean(w['source'].vectors, axis=0, keepdims=True)
    mean_vector_target = np.mean(w['target'].vectors, axis=0, keepdims=True)

    x = list(set(list(X[:,0])+list(X[:,1])))
    x1 = np.vstack([w['source'].get(word, mean_vector_source) for word in x])
    x2 = np.vstack([w['target'].get(word, mean_vector_target) for word in x])

    if save:
        filename = names[0] + '2' + names[1] + '_' + dataset
        word2vec = {'source':{},'target':{}}
        save_vectors(filename)

    return (x1, x2)
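
A minimal usage sketch (added for illustration, not part of the original snippet): it assumes `Embedding`, `numpy` and `logger` are available as in the code above, builds two toy word-to-vector dicts for the source and target spaces, and passes `save=False` so the undefined `save_vectors` helper is never reached.

import numpy as np

source_toy = {'cat': np.array([1.0, 0.0]), 'dog': np.array([0.9, 0.1])}
target_toy = {'cat': np.array([0.0, 1.0]), 'dog': np.array([0.1, 0.9])}
X_toy = np.array([['cat', 'dog']], dtype=object)
y_toy = np.array([7.5])

# 'en' and 'de' are hypothetical names; the first dict is treated as source, the second as target.
x1, x2 = get_vector_pairs({'en': source_toy, 'de': target_toy}, X_toy, y_toy,
                          dataset='simlex', save=False)
print(x1.shape, x2.shape)  # one row per unique word in the test pairs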
Example #2
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    total_words = 0
    missing_words = 0
    words = w.vocabulary.word_id
    for query in X:
        for query_word in query:
            total_words += 1
            if query_word not in words:
                missing_words += 1
    if missing_words > 0:
        logger.info(
            "Missing {} words out of {} total words in test ({}% of words are missing)."
            .format(missing_words, total_words,
                    missing_words / total_words * 100.0))
        #logger.warning("Missing {} words. Will replace them with mean vector".format(missing_words))
    '''
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    A = np.vstack(w.get(word, mean_vector) for word in X[:, 0])
    B = np.vstack(w.get(word, mean_vector) for word in X[:, 1])
    scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
    '''
    words = zip(X[:, 0], X[:, 1])
    A = []
    B = []
    new_y = []
    for (w1, w2), score in zip(words, y):
        if w1 in w and w2 in w:
            A.append(w[w1])
            B.append(w[w2])
            new_y.append(score)
    A = np.vstack(A)
    B = np.vstack(B)
    y = np.asarray(new_y)
    assert len(A) == len(B) == len(y)
    scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
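
A hedged usage example (not from the original source): since the function accepts a plain dict, a toy embedding and two rated pairs are enough to exercise the in-vocabulary filtering path.

import numpy as np

toy = {'cat': np.array([1.0, 0.0]), 'dog': np.array([0.8, 0.2]),
       'car': np.array([0.0, 1.0])}
X_toy = np.array([['cat', 'dog'], ['cat', 'car']], dtype=object)
y_toy = np.array([9.0, 1.0])
# Dot-product scores (0.8 vs 0.0) are ranked like the ratings, so Spearman is 1.0.
print(evaluate_similarity(toy, X_toy, y_toy))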
Example #3
	def checkpoint(self, epoch, sess):
		"""
		Computes intrinsic scores for the embeddings and dumps the embeddings

		Parameters
		----------
		epoch:		Current epoch number
		sess:		Tensorflow session object

		Returns
		-------
		"""
		embed_matrix, \
		context_matrix 	= sess.run([self.embed_matrix, self.context_matrix])
		voc2vec 	= {wrd: embed_matrix[wid] for wrd, wid in self.voc2id.items()}
		embedding 	= Embedding.from_dict(voc2vec)
		results		= evaluate_on_all(embedding)
		results 	= {key: round(val[0], 4) for key, val in results.items()}
		curr_int 	= np.mean(list(results.values()))
		self.logger.info('Current Score: {}'.format(curr_int))

		if curr_int > self.best_int_avg:
			self.logger.info("Saving embedding matrix")
			with open('{}/{}'.format(self.p.emb_dir, self.p.name), 'w') as f:
				for id, wrd in self.id2voc.items():
					f.write('{} {}\n'.format(wrd, ' '.join([str(round(v, 6)) for v in embed_matrix[id].tolist()])))

			self.saver.save(sess=sess, save_path=self.save_path)
			self.best_int_avg = curr_int
Example #4
def test_standardize_preserve_identity():
    d = {"Spider": [3, 4, 5], "spider": [1, 2, 3], "spideR": [3, 2, 4]}
    w3 = Embedding.from_dict(d)
    w4 = w3.standardize_words(inplace=False, lower=True)
    assert w4['spider'][0] == 1
    w3.standardize_words(inplace=True, lower=True)
    assert w3['spider'][0] == 1
Example #5
def evaluate_ana(wv, w2i, vocab):
    d = np.sum(wv ** 2, 1) ** 0.5
    W_norm = (wv.T / d).T

    evaluate_analogy_msr(W_norm, w2i)
    evaluate_analogy_google(W_norm, w2i)

    wv_dict = dict()
    for w in vocab:
        wv_dict[w] = W_norm[w2i[w], :]

    if isinstance(wv_dict, dict):
        w = Embedding.from_dict(wv_dict)
    evaluate_analogy_semeval2012(w)


#     analogy_tasks = {
#         "Google": fetch_google_analogy(),
#         "MSR": fetch_msr_analogy()
#     }

#     analogy_results = {}

#     for name, data in iteritems(analogy_tasks):
#         analogy_results[name] = evaluate_analogy(w, data.X, data.y)
#         print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))
Example #6
def evaluate_synonyms(e, problems):

    correct = 0
    total = 0

    # debugging: fall back to a random embedding if none is given
    if not e:
        all_words = np.concatenate([[q] + o for q, o, _ in problems])
        e = Embedding.from_dict({w: np.random.random(10) for w in all_words})

    meanvec = np.mean(e.vectors, axis=0)

    # with open('synonyms_test_words', 'a') as testw:
    for question, options, answer in problems:
        # testw.write('\n'.join(options+[question])+'\n')
        if question in e:
            print('question: ' + question)
            print(options)
            q_v = e[question].reshape(1, -1)
            q_ops = np.vstack(
                [e[op] if op in e else meanvec for op in options])
            distances = cdist(q_v, q_ops, metric='cosine')[0]
            selected = np.argsort(distances)[0]
            if selected == answer:
                correct += 1
        total += 1

    score = correct * 1. / total

    return score
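
An illustrative call (added, not original): each problem is a (question word, option list, index of the correct option) triple; passing a falsy `e` takes the debug path that builds a random 10-dimensional embedding, so the score is meaningless and only shows the expected input format.

problems_toy = [
    ('happy', ['glad', 'angry', 'slow'], 0),
    ('cold', ['warm', 'chilly', 'loud'], 1),
]
print(evaluate_synonyms(None, problems_toy))  # accuracy over the toy problems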
Example #7
def evaluate_on_all(w):
    """
    Evaluate Embedding on all fast-running benchmarks

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate.

    Returns
    -------
    results: pandas.DataFrame
      DataFrame with results, one per column.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Calculate results on similarity
    logger.info("Calculating similarity benchmarks")
    similarity_tasks = {"WS353": fetch_WS353()}

    similarity_results = {}

    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        logger.info("Spearman correlation of scores on {} {}".format(
            name, similarity_results[name]))

    sim = pd.DataFrame([similarity_results])
    results = sim

    return results
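
A hedged smoke test (added for illustration): building a random embedding over the WS353 vocabulary avoids any OOV handling, so this exercises the full pipeline; the resulting correlation is expected to be near zero. It assumes `fetch_WS353` and `evaluate_similarity` are imported as in the snippet.

import numpy as np

data = fetch_WS353()
rng = np.random.RandomState(0)
vocab_toy = set(data.X[:, 0]) | set(data.X[:, 1])
toy_embedding = {word: rng.randn(50) for word in vocab_toy}
print(evaluate_on_all(toy_embedding))  # one-row DataFrame with a WS353 column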
Example #8
def evaluate_similarity(w, X, y, restrict_to_words=None):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """

    from web.embedding import Embedding

    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])
    scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
Example #9
def evaluate_simi(wv, w2i, vocab):
    wv_dict = dict()
    for w in vocab:
        wv_dict[w] = wv[w2i[w], :]

    if isinstance(wv_dict, dict):
        w = Embedding.from_dict(wv_dict)

    # Calculate results on similarity
    print("Calculating similarity benchmarks")
    similarity_tasks = {
        "WS353": fetch_WS353(),
        "RG65": fetch_RG65(),
        #         "WS353R": fetch_WS353(which="relatedness"),
        #         "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "MTurk": fetch_MTurk(),
        "RW": fetch_RW(),
        "MEN": fetch_MEN(),
    }

    #     similarity_results = {}

    for name, data in iteritems(similarity_tasks):
        print(
            "Sample data from {}, num of samples: {} : pair \"{}\" and \"{}\" is assigned score {}"
            .format(name, len(data.X), data.X[0][0], data.X[0][1], data.y[0]))
        score = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, score))
Example #10
def test_standardize_preserve_identity():
    d = {"Spider": [3, 4, 5], "spider": [1, 2, 3], "spideR": [3, 2, 4]}
    w3 = Embedding.from_dict(d)
    w4 = w3.standardize_words(inplace=False, lower=True)
    assert w4['spider'][0] == 1
    w3.standardize_words(inplace=True, lower=True)
    assert w3['spider'][0] == 1
Example #11
def evaluate_analogy(w, X, y, method="add", k=None, category=None, batch_size=100):
    """
    Simple method to score embedding using SimpleAnalogySolver

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    method : {"add", "mul"}
      Method to use when finding analogy answer, see "Improving Distributional Similarity
      with Lessons Learned from Word Embeddings"

    X : array-like, shape (n_samples, 3)
      Analogy questions.

    y : array-like, shape (n_samples, )
      Analogy answers.

    k : int, default: None
      If not None will select k top most frequent words from embedding

    batch_size : int, default: 100
      Increase to increase memory consumption and decrease running time

    category : list, default: None
      Category of each example, if passed function returns accuracy per category
      in addition to the overall performance.
      Analogy datasets have "category" field that can be supplied here.

    Returns
    -------
    result: float or pandas.DataFrame
      Overall accuracy, or, if `category` is passed, a DataFrame with per-category
      accuracy, correct counts and totals (row "all" summarizes across categories)
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert category is None or len(category) == y.shape[0], "Passed incorrect category list"

    solver = SimpleAnalogySolver(w=w, method=method, batch_size=batch_size, k=k)
    y_pred = solver.predict(X)

    if category is not None:
        results = OrderedDict({"all": np.mean(y_pred == y)})
        count = OrderedDict({"all": len(y_pred)})
        correct = OrderedDict({"all": np.sum(y_pred == y)})
        for cat in set(category):
            results[cat] = np.mean(y_pred[category == cat] == y[category == cat])
            count[cat] = np.sum(category == cat)
            correct[cat] = np.sum(y_pred[category == cat] == y[category == cat])

        return pd.concat([pd.Series(results, name="accuracy"),
                          pd.Series(correct, name="correct"),
                          pd.Series(count, name="count")],
                         axis=1)
    else:
        return np.mean(y_pred == y)
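
A usage sketch (assumed, not from the source): analogy questions are rows of three words, `y` holds the expected fourth word, and passing `category` switches the return value to a per-category DataFrame. With a random toy embedding the accuracies are of course meaningless.

import numpy as np

rng = np.random.RandomState(0)
vocab_toy = ['king', 'queen', 'man', 'woman', 'paris', 'france', 'rome', 'italy']
emb_toy = {word: rng.randn(20) for word in vocab_toy}
X_toy = np.array([['man', 'woman', 'king'],
                  ['paris', 'france', 'rome']], dtype=object)
y_toy = np.array(['queen', 'italy'], dtype=object)
cats = np.array(['gender', 'capital'])
print(evaluate_analogy(emb_toy, X_toy, y_toy, category=cats))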
Example #12
def evaluate_analogy(w, X, y, method="add", k=None, category=None, batch_size=100):
    """
    Simple method to score embedding using SimpleAnalogySolver

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    method : {"add", "mul"}
      Method to use when finding analogy answer, see "Improving Distributional Similarity
      with Lessons Learned from Word Embeddings"

    X : array-like, shape (n_samples, 3)
      Analogy questions.

    y : array-like, shape (n_samples, )
      Analogy answers.

    k : int, default: None
      If not None will select k top most frequent words from embedding

    batch_size : int, default: 100
      Increase to increase memory consumption and decrease running time

    category : list, default: None
      Category of each example, if passed function returns accuracy per category
      in addition to the overall performance.
      Analogy datasets have "category" field that can be supplied here.

    Returns
    -------
    result: float or pandas.DataFrame
      Overall accuracy, or, if `category` is passed, a DataFrame with per-category
      accuracy, correct counts and totals (row "all" summarizes across categories)
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert category is None or len(category) == y.shape[0], "Passed incorrect category list"

    solver = SimpleAnalogySolver(w=w, method=method, batch_size=batch_size, k=k)
    y_pred = solver.predict(X)

    if category is not None:
        results = OrderedDict({"all": np.mean(y_pred == y)})
        count = OrderedDict({"all": len(y_pred)})
        correct = OrderedDict({"all": np.sum(y_pred == y)})
        for cat in set(category):
            results[cat] = np.mean(y_pred[category == cat] == y[category == cat])
            count[cat] = np.sum(category == cat)
            correct[cat] = np.sum(y_pred[category == cat] == y[category == cat])

        return pd.concat([pd.Series(results, name="accuracy"),
                          pd.Series(correct, name="correct"),
                          pd.Series(count, name="count")],
                         axis=1)
    else:
        return np.mean(y_pred == y)
Example #13
def evaluate_on_semeval_2012_2(w):
    """
    Simple method to score embedding using SimpleAnalogySolver

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    Returns
    -------
    result: pandas.DataFrame
      Results with spearman correlation per broad category with special key "all" for summary
      spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    data = fetch_semeval_2012_2()
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    categories = data.y.keys()
    results = defaultdict(list)
    for c in categories:
        # Get mean of left and right vector
        prototypes = data.X_prot[c]
        prot_left = np.mean(
            np.vstack([w.get(word, mean_vector) for word in prototypes[:, 0]]),
            axis=0,
        )
        prot_right = np.mean(
            np.vstack([w.get(word, mean_vector) for word in prototypes[:, 1]]),
            axis=0,
        )

        questions = data.X[c]
        question_left, question_right = (
            np.vstack([w.get(word, mean_vector) for word in questions[:, 0]]),
            np.vstack([w.get(word, mean_vector) for word in questions[:, 1]]),
        )

        scores = np.dot(prot_left - prot_right,
                        (question_left - question_right).T)

        c_name = data.categories_names[c].split("_")[0]
        # NaN happens when there are only 0s, which might happen for very rare words or
        # very insufficient word vocabulary
        cor = scipy.stats.spearmanr(scores, data.y[c]).correlation
        results[c_name].append(0 if np.isnan(cor) else cor)

    final_results = OrderedDict()
    final_results["all"] = sum(sum(v)
                               for v in results.values()) / len(categories)
    for k in results:
        final_results[k] = sum(results[k]) / len(results[k])
    return pd.Series(final_results)
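
The per-category score above is the dot product between the prototype relation offset and each question pair's offset; a tiny self-contained sketch of that step with made-up vectors:

import numpy as np

prot_left = np.array([1.0, 0.0])      # mean of the prototype left words
prot_right = np.array([0.0, 1.0])     # mean of the prototype right words
question_left = np.array([[0.9, 0.1], [0.2, 0.8]])
question_right = np.array([[0.1, 0.9], [0.8, 0.2]])

scores = np.dot(prot_left - prot_right, (question_left - question_right).T)
print(scores)  # [ 1.6 -1.2]: the first pair expresses the prototype relation, the second inverts it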
Example #14
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words = 0
    words = w.vocabulary.word_id
    for query in X:
        for query_word in query:
            if query_word not in words:
                missing_words += 1


#     if missing_words > 0:
#         print("Missing {} words. Will replace them with mean vector".format(missing_words))

    new_x = []
    new_y = []
    exist_cnt = 0

    for i in range(len(X)):
        if X[i, 0] in words and X[i, 1] in words:
            new_x.append(X[i])
            new_y.append(y[i])
            exist_cnt += 1

    print('exist {} in {}'.format(exist_cnt, len(X)))
    X = np.array(new_x)
    y = np.array(new_y)

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])
    #     scores = np.array([v1.dot(v2.T)/(np.linalg.norm(v1)*np.linalg.norm(v2)) for v1, v2 in zip(A, B)])
    scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
Example #15
def evaluate_similarity(w, X, y, missing_words='mean'):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    n_missing_words = count_missing_words(w, X)
    if n_missing_words > 0:
        logger.warning("Missing {} words.".format(n_missing_words))

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    A, B = [], []
    if missing_words == 'mean' or n_missing_words == 0:
        if n_missing_words:
            logger.info("Will replace the {} missing words with the mean vector".format(n_missing_words))
        A = [w.get(word, mean_vector) for word in X[:, 0]]
        B = [w.get(word, mean_vector) for word in X[:, 1]]
    elif missing_words == 'filter_out':
        logger.info("Will ignore them")
        y_filtered = []
        for x, gt in zip(X, y):
            a, b = x
            if a not in w or b not in w:
                continue
            A.append(w.get(a, mean_vector))
            B.append(w.get(b, mean_vector))
            y_filtered.append(gt)
        y = np.asarray(y_filtered)

    #A = np.asarray([w.get(word, mean_vector) for word in X[:, 0]])
    #B = np.asarray([w.get(word, mean_vector) for word in X[:, 1]])
    scores = np.array([cosine_similarity(v1, v2) for v1, v2 in zip(A, B)])
    #scores = np.array([v1.dot(v2.T)/(np.linalg.norm(v1)*np.linalg.norm(v2)) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
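
The snippet above relies on two helpers that are not shown; the following are plausible minimal definitions, stated as assumptions rather than the original implementations.

import numpy as np

def count_missing_words(w, X):
    """Count query words in X that are missing from the embedding vocabulary."""
    words = w.vocabulary.word_id
    return sum(1 for query in X for query_word in query if query_word not in words)

def cosine_similarity(v1, v2):
    """Cosine similarity between two vectors; tolerates (1, d)-shaped rows."""
    v1, v2 = np.ravel(v1), np.ravel(v2)
    return v1.dot(v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))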
Example #16
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words = 0
    words = w.vocabulary.word_id
    for query in X:
        for query_word in query:
            if query_word not in words:
                missing_words += 1
    if missing_words > 0:
        logger.warning(
            "Missing {} words. Will replace them with mean vector".format(
                missing_words))

    #avs: mean_vector for missing_words
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    #avs: getting the vector for each word
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])

    #avs: calculate the cosine similarity between the 2 vectors
    # why v1.dot(v2.T): because we are working with matrices  http://www.thefactmachine.com/cosine-similarity/
    scores = np.array([
        v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        for v1, v2 in zip(A, B)
    ])
    return scipy.stats.spearmanr(scores, y).correlation
Example #17
def evaluate_word_analogy(wv, w2i, vocab):
    d = np.sum(wv ** 2, 1) ** 0.5
    W_norm = (wv.T / d).T

    evaluate_analogy_msr(W_norm, w2i)
    evaluate_analogy_google(W_norm, w2i)

    wv_dict = dict()
    for w in vocab:
        wv_dict[w] = W_norm[w2i[w], :]

    if isinstance(wv_dict, dict):
        w = Embedding.from_dict(wv_dict)
    evaluate_analogy_semeval2012(w)
Example #18
def evaluate(embed_matrix: dict, voc2id: dict) -> float:
    """
    Computes intrinsic scores for the given embedding matrix
    Parameters
    ----------
    embed_matrix:    Embedding matrix indexed by word id
    voc2id:          Mapping from word to its id
    Returns
    -------
    Mean of the intrinsic evaluation scores
    """
    voc2vec = {wrd: embed_matrix[wid] for wrd, wid in voc2id.items()}
    embedding = Embedding.from_dict(voc2vec)
    results = evaluate_on_all(embedding)
    results = {key: round(val[0], 4) for key, val in results.items()}
    curr_int = np.mean(list(results.values()))
    return curr_int
Example #19
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    words = w.vocabulary.word_id
    idx = []
    for i, query in enumerate(X):
        if all(query_word in words for query_word in query):
            idx.append(i)

    y = y[idx]
    X = X[idx]

    A = np.vstack([w.get(word) for word in X[:, 0]])
    B = np.vstack([w.get(word) for word in X[:, 1]])
    scores = np.array([
        v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        for v1, v2 in zip(A, B)
    ])
    return scipy.stats.spearmanr(scores, y).correlation
Example #20
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words = 0
    words = w.vocabulary.word_id
    idx = []
    for i, query in enumerate(X):
        if query[0] not in words or query[1] not in words:
            missing_words += 1
        else:
            idx.append(i)

    if missing_words > 0:
        logger.warning("Missing {} pairs. ".format(missing_words))

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])
    scores = np.array([
        v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        for v1, v2 in zip(A, B)
    ])
    print("norms", np.linalg.norm(A[0]), len(idx))
    #print("scores",np.count_nonzero(np.isnan(A)))
    return scipy.stats.spearmanr(scores[idx], y[idx]).correlation
Example #21
def load_embedding(fname,
                   format="word2vec_bin",
                   normalize=True,
                   lower=False,
                   clean_words=False,
                   load_kwargs={}):
    """
    Loads embeddings from file

    Parameters
    ----------
    fname: string
      Path to file containing embedding

    format: string
      Format of the embedding. Possible values are:
      'word2vec_bin', 'word2vec', 'glove', 'dict'

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    lower: bool, default: False
      If true will lower-case all words

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs:
      Additional parameters passed to load function. Mostly useful for 'glove' format where you
      should pass vocab_size and dim.
    """
    assert format in ['word2vec_bin', 'word2vec', 'glove',
                      'dict'], "Unrecognized format"
    if format == "word2vec_bin":
        w = Embedding.from_word2vec(fname, binary=True)
    elif format == "word2vec":
        w = Embedding.from_word2vec(fname, binary=False)
    elif format == "glove":
        w = Embedding.from_glove(fname, **load_kwargs)
    elif format == "dict":
        d = pickle.load(open(fname, "rb"), encoding='latin1')
        w = Embedding.from_dict(d)
    if normalize:
        w.normalize_words(inplace=True)
    if lower or clean_words:
        w.standardize_words(lower=lower, clean_words=clean_words, inplace=True)
    return w
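
Two hedged example calls based only on the docstring above (the file names are placeholders): the 'glove' format needs `vocab_size` and `dim` via `load_kwargs`.

w_w2v = load_embedding('GoogleNews-vectors-negative300.bin',
                       format='word2vec_bin', normalize=True)
w_glove = load_embedding('glove.6B.100d.txt', format='glove',
                         normalize=True, lower=True,
                         load_kwargs={'vocab_size': 400000, 'dim': 100})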
Example #22
def evaluate_on_semeval_2012_2(w):
    """
    Simple method to score embedding using SimpleAnalogySolver

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    Returns
    -------
    result: pandas.DataFrame
      Results with spearman correlation per broad category with special key "all" for summary
      spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    data = fetch_semeval_2012_2()
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    categories = data.y.keys()
    results = defaultdict(list)
    for c in categories:
        # Get mean of left and right vector
        prototypes = data.X_prot[c]
        prot_left = np.mean(np.vstack([w.get(word, mean_vector) for word in prototypes[:, 0]]), axis=0)
        prot_right = np.mean(np.vstack([w.get(word, mean_vector) for word in prototypes[:, 1]]), axis=0)

        questions = data.X[c]
        question_left, question_right = np.vstack([w.get(word, mean_vector) for word in questions[:, 0]]), \
                                        np.vstack([w.get(word, mean_vector) for word in questions[:, 1]])

        scores = np.dot(prot_left - prot_right, (question_left - question_right).T)

        c_name = data.categories_names[c].split("_")[0]
        # NaN happens when there are only 0s, which might happen for very rare words or
        # very insufficient word vocabulary
        cor = scipy.stats.spearmanr(scores, data.y[c]).correlation
        results[c_name].append(0 if np.isnan(cor) else cor)

    final_results = OrderedDict()
    final_results['all'] = sum(sum(v) for v in results.values()) / len(categories)
    for k in results:
        final_results[k] = sum(results[k]) / len(results[k])
    return pd.Series(final_results)
Example #23
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words = 0
    words = w.vocabulary.word_id
    for query in X:
        for query_word in query:
            if query_word not in words:
                missing_words += 1
    if missing_words > 0:
        logger.warning("Missing {} words. Will replace them with mean vector".format(missing_words))

    #avs: mean_vector for missing_words
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

    #avs: getting the vector for each word
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])

    #avs: calculate the cosine similarity between the 2 vectors
    # why v1.dot(v2.T): because we are working with matrices  http://www.thefactmachine.com/cosine-similarity/
    scores = np.array([v1.dot(v2.T)/(np.linalg.norm(v1)*np.linalg.norm(v2)) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
Example #24
def evaluate_cate(wv, w2i, vocab, method="all", seed=None):
    """
    method: string, default: "all"
      What method to use. Possible values are "agglomerative", "kmeans", "all".
      If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
      hyperparameter tuning to avoid overfitting).
      If "kmeans" is passed, method will fit KMeans.
      In both cases number of clusters is preset to the correct value.
    seed: int, default: None
      Seed passed to KMeans.
    """
    wv_dict = dict()
    for w in vocab:
        wv_dict[w] = wv[w2i[w], :]

    if isinstance(wv_dict, dict):
        w = Embedding.from_dict(wv_dict)

    # Calculate results on categorization
    print("Calculating categorization benchmarks")
    categorization_tasks = {
        "AP": fetch_AP(),
        "ESSLI_2c": fetch_ESSLI_2c(),
        "ESSLI_2b": fetch_ESSLI_2b(),
        "ESSLI_1a": fetch_ESSLI_1a(),
        "Battig": fetch_battig(),
        "BLESS": fetch_BLESS(),
    }

    categorization_results = {}

    # Calculate results using helper function
    for name, data in iteritems(categorization_tasks):
        print(
            "Sample data from {}, num of samples: {} : \"{}\" is assigned class {}"
            .format(name, len(data.X), data.X[0], data.y[0]))
        categorization_results[name] = evaluate_categorization(w,
                                                               data.X,
                                                               data.y,
                                                               method=method,
                                                               seed=seed)
        print("Cluster purity on {} {}".format(name,
                                               categorization_results[name]))
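
A hedged usage sketch for the wrapper above: it takes a raw vector matrix plus a word-to-row index, so a small random matrix is enough to show the call; the resulting purities are meaningless and the benchmark fetchers download data on first use.

import numpy as np

vocab_toy = ['apple', 'banana', 'car', 'truck']
w2i_toy = {word: i for i, word in enumerate(vocab_toy)}
wv_toy = np.random.RandomState(0).randn(len(vocab_toy), 25)
evaluate_cate(wv_toy, w2i_toy, vocab_toy, method='kmeans', seed=0)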
Example #25
def evaluate_similarity(w, X, y):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    Returns
    -------
    cor: float
      Spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words = 0
    words = w.vocabulary.word_id
    for query in X:
        for query_word in query:
            if query_word not in words:
                missing_words += 1
    if missing_words > 0:
        logger.warning(
            "Missing {} words. Will replace them with mean vector".format(
                missing_words))

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])
    scores = np.array([
        v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
        for v1, v2 in zip(A, B)
    ])
    return scipy.stats.spearmanr(scores, y).correlation
Example #26
def evaluate_on_WordRep(w, max_pairs=1000, solver_kwargs={}):
    """
    Evaluate on WordRep dataset

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    max_pairs: int, default: 1000
      Each category will be constrained to maximum of max_pairs pairs
      (which results in max_pairs * (max_pairs - 1) examples)

    solver_kwargs: dict, default: {}
      Arguments passed to SimpleAnalogySolver. It is suggested to limit number of words
      in the dictionary.

    References
    ----------
    Bin Gao, Jiang Bian, Tie-Yan Liu (2015)
     "WordRep: A Benchmark for Research on Learning Word Representations"
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    data = fetch_wordrep()
    categories = set(data.category)

    accuracy = {}
    correct = {}
    count = {}
    for cat in categories:
        X_cat = data.X[data.category == cat]
        X_cat = X_cat[0:max_pairs]

        logger.info("Processing {} with {} pairs, {} questions".format(cat, X_cat.shape[0]
                                                                       , X_cat.shape[0] * (X_cat.shape[0] - 1)))

        # For each category construct question-answer pairs
        size = X_cat.shape[0] * (X_cat.shape[0] - 1)
        X = np.zeros(shape=(size, 3), dtype="object")
        y = np.zeros(shape=(size,), dtype="object")
        id = 0
        for left, right in product(X_cat, X_cat):
            if not np.array_equal(left, right):
                X[id, 0:2] = left
                X[id, 2] = right[0]
                y[id] = right[1]
                id += 1

        # Run solver
        solver = SimpleAnalogySolver(w=w, **solver_kwargs)
        y_pred = solver.predict(X)
        correct[cat] = float(np.sum(y_pred == y))
        count[cat] = size
        accuracy[cat] = float(np.sum(y_pred == y)) / size

    # Add summary results
    correct['wikipedia'] = sum(correct[c] for c in categories if c in data.wikipedia_categories)
    correct['all'] = sum(correct[c] for c in categories)
    correct['wordnet'] = sum(correct[c] for c in categories if c in data.wordnet_categories)

    count['wikipedia'] = sum(count[c] for c in categories if c in data.wikipedia_categories)
    count['all'] = sum(count[c] for c in categories)
    count['wordnet'] = sum(count[c] for c in categories if c in data.wordnet_categories)

    accuracy['wikipedia'] = correct['wikipedia'] / count['wikipedia']
    accuracy['all'] = correct['all'] / count['all']
    accuracy['wordnet'] = correct['wordnet'] / count['wordnet']

    return pd.concat([pd.Series(accuracy, name="accuracy"),
                      pd.Series(correct, name="correct"),
                      pd.Series(count, name="count")], axis=1)
    print("Spearman correlation of scores on {} {}".format(
        'WS353', evaluate_similarity(embeddings, X, y)))
    X, y = fetch_dataset_MEN(MEN)
    print("Spearman correlation of scores on {} {}".format(
        'MEN', evaluate_similarity(embeddings, X, y)))
    X, y = fetch_dataset_SIM999(SIM999)
    print("Spearman correlation of scores on {} {}".format(
        'SIM999', evaluate_similarity(embeddings, X, y)))


if __name__ == '__main__':
    preprocess_base_dir = '../data/wikipedia/preprocess1B/NumeralAsNumeral'
    vec_base_dir = '../data/wikipedia/save/1B/prototypes/'
    # postfix = ['LSTM', 'NumeralAsToken', 'NumeralAsTokenUnkNumeral','NumeralAsUnkNumeral']
    # postfix = ['50','100','200','300']
    # postfix = ['NumeralAsToken', 'NumeralAsToken3','NumeralAsToken8']
    # postfix = ['NumeralAsTokenUnkNumeral5_300']
    postfix = ['3', '5']

    for p in postfix:

        vec = glob.glob(vec_base_dir + '/{}/idx2vec_i*.dat'.format(p))

        idx2word_path = preprocess_base_dir + '/idx2word.dat'
        idx2word = pickle.load(open(idx2word_path, 'rb'))
        for v in vec:
            print('evaluate vector file {}, in {}'.format(v, p))
            idx2vec = pickle.load(open(v, 'rb'))
            dicts = {idx2word[i]: idx2vec[i] for i in range(len(idx2vec))}
            embeddings = Embedding.from_dict(dicts)
            evaluate(embeddings)
Example #28
def evaluate_similarity(w,
                        X,
                        y,
                        tokenize_oov_words_with_deepcut=False,
                        filter_not_found=False,
                        include_structured_sources=None,
                        cut_letters_for_oov=False,
                        structed_sources_coef=0,
                        numberbatch=None):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    tokenize_oov_words_with_deepcut:
        if a Thai word is not found in the embedding (OOV), tokenize it with deepcut and try to use the mean vector of its parts

    filter_not_found:
        remove a word pair if one of the words was not found in the embedding vocabulary

    include_structured_sources:
        include a structured source. None or name

    structed_sources_coef:
        weight for the structured source

    Returns
    -------
    cor: float
      Spearman correlation
    """

    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words, found_words, oov_vecs_created, index = 0, 0, 0, 0
    word_pair_oov_indices = []
    info_oov_words = {}
    info_created_words = {}

    words = w.vocabulary.word_id

    ## NEW: use deepcut to create word vectors of word parts -- if possible
    if tokenize_oov_words_with_deepcut:

        # a) create set of OOV words in the dataset
        oov_words = set()
        for query in X:
            for query_word in query:
                if query_word not in words:
                    oov_words.add(query_word)

        # b) iterate over OOV words and see if we can set a vector from them
        for ds_word in oov_words:

            tokens = deepcut.tokenize(ds_word)
            in_voc_tokens = [tok for tok in tokens if tok in w]

            ## if we found word-parts in the emb - use their vectors (avg) to represent the OOV word
            if in_voc_tokens:
                token_vecs = [w.get(t) for t in in_voc_tokens]
                w[ds_word] = np.mean(token_vecs, axis=0)
                # print("Created vector for OOV word:", ds_word)
                oov_vecs_created += 1
                info_created_words[ds_word] = in_voc_tokens
            else:
                info_oov_words[ds_word] = tokens

        print('All OOV words after deepcut:')
        pprint(info_oov_words)
        print('All "created"/replaced words by deepcut:')
        pprint(info_created_words)

    elif cut_letters_for_oov:
        oov_words = set()

        #collecting oov words
        for query in X:
            for query_word in query:
                if query_word not in words:
                    oov_words.add(query_word)

        #iterating through each oov-word
        for oov_word in oov_words:
            cut_word = oov_word
            words_with_same_prefix = set()

            # cutting letter by letter until we find some words with the same prefix
            while len(cut_word) and cut_word not in words:
                cut_word = cut_word[:-1]

                # collectings words with the same prefix
                for vocabulary_word in w:
                    if vocabulary_word[0].startswith(cut_word):
                        words_with_same_prefix.add(vocabulary_word[0])

                # if found at least one word, then stop cutting and let's compute the avg vector
                if len(words_with_same_prefix):
                    break
            print(
                f'FOR WORD {oov_word} FOUND WORDS WITH THE SAME PREFIX: {str(words_with_same_prefix)}'
            )
            if words_with_same_prefix:
                token_vecs = [w.get(t) for t in words_with_same_prefix]
                w[oov_word] = np.mean(token_vecs, axis=0)
                oov_vecs_created += 1
                info_created_words[oov_word] = cut_word

    ## For all words in the dataset, check whether they are OOV.
    ## Indices of word pairs with an OOV word are stored in word_pair_oov_indices
    for query in X:
        for query_word in query:

            if query_word not in words:
                print("Missing Word:", query_word)
                missing_words += 1
                word_pair_oov_indices.append(index)
            else:
                print("Found Word:", query_word)
                found_words += 1
        index += 1

    word_pair_oov_indices = list(set(word_pair_oov_indices))
    print('word_pair_oov_indices', word_pair_oov_indices)

    if missing_words > 0 or oov_vecs_created > 0:
        logger.warning(
            "Missing {} words. Will replace them with mean vector".format(
                missing_words))
        logger.warning(
            "OOV words {} created from their subwords. Will replace them with mean vector of sub-tokens"
            .format(oov_vecs_created))
        logger.warning("Found {} words.".format(found_words))

    print('X.shape', X.shape)
    print('y.shape', y.shape)

    if filter_not_found:
        # added code by wohlg
        new_X = np.delete(X, word_pair_oov_indices, 0)
        # print(new_X)
        new_y = np.delete(y, word_pair_oov_indices)

        print('new_X.shape', new_X.shape)
        print('new_y.shape', new_y.shape)

        mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
        A = np.vstack([w.get(word, mean_vector) for word in new_X[:, 0]])
        B = np.vstack([w.get(word, mean_vector) for word in new_X[:, 1]])
        print(len(A), len(B))
        print(type(A), type(B))
        scores = np.array([
            v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
            for v1, v2 in zip(A, B)
        ])

        y = new_y
        pairs = new_X

    else:
        # orig code
        mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

        A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
        B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])
        scores = np.array([
            v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
            for v1, v2 in zip(A, B)
        ])
        pairs = X

    # alexpulich / wohlg:
    if include_structured_sources == 'wn':
        wn_scores, structed_oov_pairs = compute_wordnet_path_scores(pairs)
        ## wordnet_method1 or wordnet_method2: currently hardcoded, can be refactored if needed :)
        scores = wordnet_method1(list(scores), pairs, wn_scores,
                                 structed_sources_coef)
        # scores = wordnet_method2(list(scores), pairs, wn_scores, structed_sources_coef)
    elif include_structured_sources == 'cn':
        # comment this out if you don't want to use numberbatch
        cn_scores, structed_oov_pairs = compute_conceptnet_path_scores(
            pairs, numberbatch)
        scores = conceptnet_method1(list(scores), pairs, cn_scores,
                                    structed_sources_coef)
        # scores = conceptnet_method2(list(scores), pairs, cn_scores, structed_sources_coef)

    # wohlg: original version only returned Spearman
    # wohlg: we added Pearson and other information
    result = {
        'spearmanr': scipy.stats.spearmanr(scores, y).correlation,
        'pearsonr': scipy.stats.pearsonr(scores, y)[0],
        'num_oov_word_pairs': len(word_pair_oov_indices),
        'num_found_words': found_words,
        'num_missing_words': missing_words,
        'num_oov_created': oov_vecs_created,
        'y.shape': y.shape
    }

    if include_structured_sources:
        result['structed_oov_pairs'] = structed_oov_pairs

    return result
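
A toy call (added for illustration) showing the richer return value of this variant; deepcut tokenization and structured sources stay switched off, and `filter_not_found=True` drops the pair containing the OOV word 'plane'.

import numpy as np

toy = {'cat': np.array([1.0, 0.0]), 'dog': np.array([0.8, 0.2]),
       'car': np.array([0.0, 1.0]), 'bus': np.array([0.1, 0.9])}
X_toy = np.array([['cat', 'dog'], ['car', 'bus'], ['cat', 'plane']], dtype=object)
y_toy = np.array([9.0, 8.0, 1.0])
res = evaluate_similarity(toy, X_toy, y_toy, filter_not_found=True)
print(res['spearmanr'], res['num_missing_words'])  # correlation over the two surviving pairs, 1 missing word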
Example #29
def evaluate_on_all(w):
    """
    Evaluate Embedding on all fast-running benchmarks

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate.

    Returns
    -------
    results: pandas.DataFrame
      DataFrame with results, one per column.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Calculate results on similarity
    logger.info("Calculating similarity benchmarks")
    similarity_tasks = {
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "WS353R": fetch_WS353(which="relatedness"),
        "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "RW": fetch_RW(),
        "RG65": fetch_RG65(),
        "MTurk": fetch_MTurk(),
    }

    similarity_results = {}

    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        logger.info("Spearman correlation of scores on {} {}".format(
            name, similarity_results[name]))

    # Calculate results on analogy
    logger.info("Calculating analogy benchmarks")
    analogy_tasks = {
        "Google": fetch_google_analogy(),
        "MSR": fetch_msr_analogy()
    }

    analogy_results = {}

    for name, data in iteritems(analogy_tasks):
        analogy_results[name] = evaluate_analogy(w, data.X, data.y)
        logger.info("Analogy prediction accuracy on {} {}".format(
            name, analogy_results[name]))

    analogy_results["SemEval2012_2"] = evaluate_on_semeval_2012_2(w)['all']
    logger.info("Analogy prediction accuracy on {} {}".format(
        "SemEval2012", analogy_results["SemEval2012_2"]))

    # Calculate results on categorization
    logger.info("Calculating categorization benchmarks")
    categorization_tasks = {
        "AP": fetch_AP(),
        "BLESS": fetch_BLESS(),
        "Battig": fetch_battig(),
        "ESSLI_2c": fetch_ESSLI_2c(),
        "ESSLI_2b": fetch_ESSLI_2b(),
        "ESSLI_1a": fetch_ESSLI_1a()
    }

    categorization_results = {}

    # Calculate results using helper function
    for name, data in iteritems(categorization_tasks):
        categorization_results[name] = evaluate_categorization(
            w, data.X, data.y)
        logger.info("Cluster purity on {} {}".format(
            name, categorization_results[name]))

    # Construct pd table
    cat = pd.DataFrame([categorization_results])
    analogy = pd.DataFrame([analogy_results])
    sim = pd.DataFrame([similarity_results])
    results = cat.join(sim).join(analogy)

    return results
Example #30
def evaluate_categorization(w, X, y, method="all", seed=None):
    """
    Evaluate embeddings on categorization task.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to test.

    X: vector, shape: (n_samples, )
      Vector of words.

    y: vector, shape: (n_samples, )
      Vector of cluster assignments.

    method: string, default: "all"
      What method to use. Possible values are "agglomerative", "kmeans", "all".
      If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
      hyperparameter tuning to avoid overfitting).
      If "kmeans" is passed, method will fit KMeans.
      In both cases number of clusters is preset to the correct value.

    seed: int, default: None
      Seed passed to KMeans.

    Returns
    -------
    purity: float
      Purity of the best obtained clustering.

    Notes
    -----
    KMedoids method was excluded as empirically didn't improve over KMeans (for categorization
    tasks available in the package).
    """

    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert method in ["all", "kmeans", "agglomerative", "mean-shift", "spectral", "affinityPropagation", "birch"], "Unrecognized method"

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    words = np.vstack([w.get(word, mean_vector) for word in X.flatten()])
    ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False)

    # Evaluate clustering on several hyperparameters of AgglomerativeClustering and
    # KMeans
    best_purity = 0

    if method == "all" or method == "agglomerative":
        best_purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)),
                                                                       affinity="euclidean",
                                                                       linkage="ward").fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using affinity={} linkage={}".format(best_purity, 'euclidean', 'ward'))
        for affinity in ["cosine", "euclidean"]:
            for linkage in ["average", "complete"]:
                purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)),
                                                                          affinity=affinity,
                                                                          linkage=linkage).fit_predict(words[ids]))
                logger.debug("Purity={:.3f} using affinity={} linkage={}".format(purity, affinity, linkage))
                best_purity = max(best_purity, purity)

    if method == "all" or method == "kmeans":
        purity = calculate_purity(y[ids], KMeans(random_state=seed, n_init=10, n_clusters=len(set(y))).
                                  fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using KMeans".format(purity))
        best_purity = max(purity, best_purity)
        
    if method == "all" or method == "mean-shift":
        # # # If takes too long: Note that the estimate_bandwidth function
        # # # is much less scalable than the mean shift algorithm 
        # # # and will be the bottleneck if it is used.
        purity = calculate_purity(y[ids], MeanShift(bin_seeding = True, n_jobs=5).fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using MeanShift".format(purity))
        best_purity = max(purity, best_purity)
        
    if method == "all" or method == "spectral":
        for affinity in ['nearest_neighbors', 'rbf']:
            purity = calculate_purity(y[ids], SpectralClustering(n_clusters=len(set(y)),
                                                                 affinity=affinity,
                                                                 random_state=seed,
                                                                 n_jobs=5).fit_predict(words[ids]))
            logger.debug("Purity={:.3f} using SpectralClustering affinity={}".format(purity, affinity))
            best_purity = max(purity, best_purity)
          
    if method == "all" or method == "affinityPropagation":
        # preference : array-like, shape (n_samples,) or float, optional
        #
        # Preferences for each point - points with larger values of preferences are more
        # likely to be chosen as exemplars. The number of exemplars, ie of clusters, is
        # influenced by the input preferences value. If the preferences are not passed as
        # arguments, they will be set to the median of the input similarities.

        for affinity in ["cosine", "euclidean"]:
            purity = calculate_purity(y[ids], AffinityPropagation(preference=None, affinity=affinity).fit_predict(words[ids]))
            logger.debug("Purity={:.3f} using Affinity Propagation".format(purity))
            best_purity = max(purity, best_purity)


    if method == "all" or method == "birch":
        purity = calculate_purity(y[ids], Birch(threshold=0.5, branching_factor=50, n_clusters=len(set(y))).fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using Birch".format(purity))
        best_purity = max(purity, best_purity)
     
    return best_purity
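
`calculate_purity` is used above but not defined in the snippet; this is a minimal sketch of the standard purity computation, offered as an assumption about its behaviour rather than the original code.

import numpy as np

def calculate_purity(y_true, y_pred):
    """Fraction of points falling in the majority true class of their assigned cluster."""
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    majority_total = 0
    for cluster in np.unique(y_pred):
        _, counts = np.unique(y_true[y_pred == cluster], return_counts=True)
        majority_total += counts.max()
    return majority_total / len(y_true)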
Example #31
def evaluate_on_WordRep(w, max_pairs=1000, solver_kwargs={}):
    """
    Evaluate on WordRep dataset

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    max_pairs: int, default: 1000
      Each category will be constrained to maximum of max_pairs pairs
      (which results in max_pairs * (max_pairs - 1) examples)

    solver_kwargs: dict, default: {}
      Arguments passed to SimpleAnalogySolver. It is suggested to limit number of words
      in the dictionary.

    References
    ----------
    Bin Gao, Jiang Bian, Tie-Yan Liu (2015)
     "WordRep: A Benchmark for Research on Learning Word Representations"
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    data = fetch_wordrep()
    categories = set(data.category)

    accuracy = {}
    correct = {}
    count = {}
    for cat in categories:
        X_cat = data.X[data.category == cat]
        X_cat = X_cat[0:max_pairs]

        logger.info("Processing {} with {} pairs, {} questions".format(
            cat, X_cat.shape[0], X_cat.shape[0] * (X_cat.shape[0] - 1)))

        # For each category construct question-answer pairs
        size = X_cat.shape[0] * (X_cat.shape[0] - 1)
        X = np.zeros(shape=(size, 3), dtype="object")
        y = np.zeros(shape=(size, ), dtype="object")
        id = 0
        for left, right in product(X_cat, X_cat):
            if not np.array_equal(left, right):
                X[id, 0:2] = left
                X[id, 2] = right[0]
                y[id] = right[1]
                id += 1

        # Run solver
        solver = SimpleAnalogySolver(w=w, **solver_kwargs)
        y_pred = solver.predict(X)
        correct[cat] = float(np.sum(y_pred == y))
        count[cat] = size
        accuracy[cat] = float(np.sum(y_pred == y)) / size

    # Add summary results
    correct['wikipedia'] = sum(correct[c] for c in categories
                               if c in data.wikipedia_categories)
    correct['all'] = sum(correct[c] for c in categories)
    correct['wordnet'] = sum(correct[c] for c in categories
                             if c in data.wordnet_categories)

    count['wikipedia'] = sum(count[c] for c in categories
                             if c in data.wikipedia_categories)
    count['all'] = sum(count[c] for c in categories)
    count['wordnet'] = sum(count[c] for c in categories
                           if c in data.wordnet_categories)

    accuracy['wikipedia'] = correct['wikipedia'] / count['wikipedia']
    accuracy['all'] = correct['all'] / count['all']
    accuracy['wordnet'] = correct['wordnet'] / count['wordnet']

    return pd.concat([
        pd.Series(accuracy, name="accuracy"),
        pd.Series(correct, name="correct"),
        pd.Series(count, name="count")
    ],
                     axis=1)
Example #32
def evaluate_similarity(w,
                        X,
                        y,
                        tokenize_oov_words_with_deepcut=False,
                        filter_not_found=False):
    """
    Calculate Spearman correlation between cosine similarity of the model
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    tokenize_oov_words_with_deepcut:
        if a Thai word is not found in the embedding (OOV), tokenize it with deepcut and try to use the mean vector of its parts

    filter_not_found:
        remove a word pair if one of the words was not found in the embedding vocabulary

    Returns
    -------
    cor: float
      Spearman correlation
    """

    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    missing_words, found_words, oov_vecs_created, index = 0, 0, 0, 0
    word_pair_oov_indices = []
    info_oov_words = {}
    info_created_words = {}

    words = w.vocabulary.word_id

    ## NEW: use deepcut to create word vectors of word parts -- if possible
    if tokenize_oov_words_with_deepcut:

        # a) create set of OOV words in the dataset
        oov_words = set()
        for query in X:
            for query_word in query:
                if query_word not in words:
                    oov_words.add(query_word)

        # b) iterate over OOV words and see if we can set a vector from them
        for ds_word in oov_words:

            tokens = deepcut.tokenize(ds_word)
            in_voc_tokens = [tok for tok in tokens if tok in w]

            ## if we found word-parts in the emb - use their vectors (avg) to represent the OOV word
            if in_voc_tokens:
                token_vecs = [w.get(t) for t in in_voc_tokens]
                w[ds_word] = np.mean(token_vecs, axis=0)
                #print("Created vector for OOV word:", ds_word)
                oov_vecs_created += 1
                info_created_words[ds_word] = in_voc_tokens
            else:
                info_oov_words[ds_word] = tokens

        print('All OOV words after deepcut:')
        pprint(info_oov_words)
        print('All "created"/replaced words by deepcut:')
        pprint(info_created_words)

    ## For all words in the dataset, check whether they are OOV.
    ## Indices of word pairs with an OOV word are stored in word_pair_oov_indices
    for query in X:
        for query_word in query:

            if query_word not in words:
                print("Missing Word:", query_word)
                missing_words += 1
                word_pair_oov_indices.append(index)
            else:
                print("Found Word:", query_word)
                found_words += 1
        index += 1

    word_pair_oov_indices = list(set(word_pair_oov_indices))
    print('word_pair_oov_indices', word_pair_oov_indices)

    if missing_words > 0 or oov_vecs_created > 0:
        logger.warning(
            "Missing {} words. Will replace them with mean vector".format(
                missing_words))
        logger.warning(
            "Created vectors for {} OOV words from the mean of their in-vocabulary sub-tokens."
            .format(oov_vecs_created))
        logger.warning("Found {} words.".format(found_words))

    print('X.shape', X.shape)
    print('y.shape', y.shape)

    if filter_not_found:
        # added code by wohlg
        new_X = np.delete(X, word_pair_oov_indices, 0)
        #print(new_X)
        new_y = np.delete(y, word_pair_oov_indices)

        print('new_X.shape', new_X.shape)
        print('new_y.shape', new_y.shape)

        mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
        # NumPy >= 1.16 requires a sequence here, not a bare generator expression
        A = np.vstack([w.get(word, mean_vector) for word in new_X[:, 0]])
        B = np.vstack([w.get(word, mean_vector) for word in new_X[:, 1]])
        print(len(A), len(B))
        print(type(A), type(B))
        scores = np.array([
            v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
            for v1, v2 in zip(A, B)
        ])

        y = new_y

    else:
        # orig code
        mean_vector = np.mean(w.vectors, axis=0, keepdims=True)

        A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
        B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])
        scores = np.array([
            v1.dot(v2.T) / (np.linalg.norm(v1) * np.linalg.norm(v2))
            for v1, v2 in zip(A, B)
        ])

    ## insert new code here
    # print(X)
    # print(scores)
    # print(scores.shape)
    # import sys; sys.exit()

    # if include_structured_sources: # given as argv when calling the script?

#   scores = add_structured_info(X, scores) # call to function outside

    # wohlg: the original version only returned the Spearman correlation
    # wohlg: we added the Pearson correlation and other information
    result = {
        'spearmanr': scipy.stats.spearmanr(scores, y).correlation,
        'pearsonr': scipy.stats.pearsonr(scores, y)[0],
        'num_oov_word_pairs': len(word_pair_oov_indices),
        'num_found_words': found_words,
        'num_missing_words': missing_words,
        'num_oov_created': oov_vecs_created,
        'y.shape': y.shape
    }

    return result
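# --- A minimal usage sketch (added, not part of the original snippet) ---
# It assumes the module-level imports used above (numpy as np, scipy, Embedding);
# the toy vectors, word pairs and ratings below are illustrative only, not a real benchmark.
if __name__ == "__main__":
    toy_vectors = {
        "cat": np.array([0.9, 0.1], dtype=np.float32),
        "dog": np.array([0.8, 0.2], dtype=np.float32),
        "car": np.array([0.1, 0.9], dtype=np.float32),
    }
    X_demo = np.array([["cat", "dog"], ["cat", "car"]])   # word pairs
    y_demo = np.array([9.0, 1.0])                         # human ratings
    demo = evaluate_similarity(toy_vectors, X_demo, y_demo, filter_not_found=True)
    print(demo["spearmanr"], demo["num_missing_words"])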
Example #33
def evaluate_on_all(w, word_embedding_name):
    """
    Evaluate Embedding on all fast-running benchmarks

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate.

    word_embedding_name: str
      Name of the embedding, used to tag the results row.

    Returns
    -------
    results: pandas.DataFrame
      DataFrame with results, one per column.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Calculate results on similarity
    logger.info("Calculating similarity benchmarks")
    similarity_tasks = {
        "MTurk": fetch_MTurk(),
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "Rubenstein_and_Goodenough": fetch_RG65(),
        "Rare_Words": fetch_RW(),
        "SimLex999": fetch_SimLex999(),
        "TR9856": fetch_TR9856(),
    }

    similarity_results = {}

    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        logger.info("Spearman correlation of scores on {} {}".format(name, similarity_results[name]))

    # Calculate results on analogy
    logger.info("Calculating analogy benchmarks")
    analogy_tasks = {
        "Google_analogy": fetch_google_analogy(),
        "MSR": fetch_msr_analogy(),
        # "SEMEVAL 2012 Task 2" 
    }

    analogy_results = {}

    for name, data in iteritems(analogy_tasks):
        analogy_results[name] = evaluate_analogy(w, data.X, data.y)
        logger.info("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))

    analogy_results["MSR_WordRep"] = evaluate_on_WordRep(w)['all']
    logger.info("Analogy prediction accuracy on {} {}".format("MSR_WordRep", analogy_results["MSR_WordRep"]))

    analogy_results["SemEval2012_2"] = evaluate_on_semeval_2012_2(w)['all']
    logger.info("Analogy prediction accuracy on {} {}".format("SemEval_2012_Task_2", analogy_results["SemEval2012_2"]))

    # # Calculate results on categorization
    # logger.info("Calculating categorization benchmarks")
    # categorization_tasks = {
    #     "AP": fetch_AP(),
    #     "BLESS": fetch_BLESS(),
    #     "Battig": fetch_battig(),
    #     "ESSLI_2c": fetch_ESSLI_2c(),
    #     "ESSLI_2b": fetch_ESSLI_2b(),
    #     "ESSLI_1a": fetch_ESSLI_1a()
    # }

    # categorization_results = {}

    # # Calculate results using helper function
    # for name, data in iteritems(categorization_tasks):
    #     categorization_results[name] = evaluate_categorization(w, data.X, data.y)
    #     logger.info("Cluster purity on {} {}".format(name, categorization_results[name]))

    # # Construct pd table
    # cat = pd.DataFrame([categorization_results])
    analogy = pd.DataFrame([analogy_results])
    sim = pd.DataFrame([similarity_results])
    # results = cat.join(sim).join(analogy)
    w_name = {"word_embedding": word_embedding_name}
    w_name = pd.DataFrame([w_name])
    results = w_name.join(sim).join(analogy)

    # results = sim.join(analogy)

    return results
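# --- A hedged usage sketch (added, not part of the original snippet) ---
# Assumes the module-level imports used above (numpy as np, pandas as pd, the fetch_* helpers).
# The random toy vectors only exercise the pipeline; the benchmark datasets are downloaded
# on demand and a full run can take a while.
if __name__ == "__main__":
    rng = np.random.RandomState(0)
    toy_vectors = {word: rng.randn(50) for word in ["cat", "dog", "car", "king", "queen"]}
    results = evaluate_on_all(toy_vectors, word_embedding_name="toy-50d")
    results.to_csv("toy-50d_benchmarks.csv", index=False)
    print(results.T)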
Example #34
def evaluate_on_all(w):
    """
    Evaluate Embedding on all fast-running benchmarks

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate.

    Returns
    -------
    results: pandas.DataFrame
      DataFrame with results, one per column.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Calculate results on similarity
    logger.info("Calculating similarity benchmarks")
    similarity_tasks = {
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "WS353R": fetch_WS353(which="relatedness"),
        "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "RW": fetch_RW(),
        "RG65": fetch_RG65(),
        "MTurk": fetch_MTurk(),
    }

    similarity_results = {}

    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        logger.info("Spearman correlation of scores on {} {}".format(name, similarity_results[name]))

    # Calculate results on analogy
    logger.info("Calculating analogy benchmarks")
    analogy_tasks = {
        "Google": fetch_google_analogy(),
        "MSR": fetch_msr_analogy()
    }

    analogy_results = {}

    for name, data in iteritems(analogy_tasks):
        analogy_results[name] = evaluate_analogy(w, data.X, data.y)
        logger.info("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))

    analogy_results["SemEval2012_2"] = evaluate_on_semeval_2012_2(w)['all']
    logger.info("Analogy prediction accuracy on {} {}".format("SemEval2012", analogy_results["SemEval2012_2"]))

    # Calculate results on categorization
    logger.info("Calculating categorization benchmarks")
    categorization_tasks = {
        "AP": fetch_AP(),
        "BLESS": fetch_BLESS(),
        "Battig": fetch_battig(),
        "ESSLI_2c": fetch_ESSLI_2c(),
        "ESSLI_2b": fetch_ESSLI_2b(),
        "ESSLI_1a": fetch_ESSLI_1a()
    }

    categorization_results = {}

    # Calculate results using helper function
    for name, data in iteritems(categorization_tasks):
        categorization_results[name] = evaluate_categorization(w, data.X, data.y)
        logger.info("Cluster purity on {} {}".format(name, categorization_results[name]))

    # Construct pd table
    cat = pd.DataFrame([categorization_results])
    analogy = pd.DataFrame([analogy_results])
    sim = pd.DataFrame([similarity_results])
    results = cat.join(sim).join(analogy)

    return results
Example #35
def evaluate_categorization(w, X, y, method="all", seed=None):
    """
    Evaluate embeddings on categorization task.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to test.

    X: vector, shape: (n_samples, )
      Vector of words.

    y: vector, shape: (n_samples, )
      Vector of cluster assignments.

    method: string, default: "all"
      What method to use. Possible values are "agglomerative", "kmeans", "all".
      If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
      hyperparameter tuning to avoid overfitting).
      If "kmeans" is passed, method will fit KMeans.
      In both cases number of clusters is preset to the correct value.

    seed: int, default: None
      Seed passed to KMeans.

    Returns
    -------
    purity: float
      Purity of the best obtained clustering.

    Notes
    -----
    KMedoids method was excluded as empirically didn't improve over KMeans (for categorization
    tasks available in the package).
    """

    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert method in ["all", "kmeans", "agglomerative"], "Unrecognized method"

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    w.oov = 0
    words = np.vstack([w.get(word, mean_vector) for word in X.flatten()])
    print('{} oov words out of {}'.format(w.oov, len(X.flatten())))
    ids = np.random.RandomState(seed).choice(range(len(X)),
                                             len(X),
                                             replace=False)

    # Evaluate clustering on several hyperparameters of AgglomerativeClustering and
    # KMeans
    best_purity = 0

    if method == "all" or method == "agglomerative":
        best_purity = calculate_purity(
            y[ids],
            AgglomerativeClustering(n_clusters=len(set(y)),
                                    affinity="euclidean",
                                    linkage="ward").fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using affinity={} linkage={}".format(
            best_purity, 'euclidean', 'ward'))
        for affinity in ["cosine", "euclidean"]:
            for linkage in ["average", "complete"]:
                purity = calculate_purity(
                    y[ids],
                    AgglomerativeClustering(n_clusters=len(set(y)),
                                            affinity=affinity,
                                            linkage=linkage).fit_predict(
                                                words[ids]))
                logger.debug(
                    "Purity={:.3f} using affinity={} linkage={}".format(
                        purity, affinity, linkage))
                best_purity = max(best_purity, purity)

    if method == "all" or method == "kmeans":
        purity = calculate_purity(
            y[ids],
            KMeans(random_state=seed, n_init=10,
                   n_clusters=len(set(y))).fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using KMeans".format(purity))
        best_purity = max(purity, best_purity)

    return best_purity
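# --- A hedged usage sketch (added, not part of the original snippet) ---
# Assumes the module-level imports used above (numpy as np, Embedding, fetch_ESSLI_2c).
# The random 50-dimensional vectors are illustrative only; a real embedding should be
# loaded instead to get a meaningful purity score.
if __name__ == "__main__":
    data = fetch_ESSLI_2c()                      # downloads the dataset on first use
    rng = np.random.RandomState(0)
    toy = Embedding.from_dict({word: rng.randn(50) for word in data.X.flatten()})
    purity = evaluate_categorization(toy, data.X, data.y, method="kmeans", seed=42)
    print("ESSLI_2c purity with random vectors: {:.3f}".format(purity))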
Example #36
def evaluate_categorization(w, X, y, method="all", seed=None):
    """
    Evaluate embeddings on categorization task.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to test.

    X: vector, shape: (n_samples, )
      Vector of words.

    y: vector, shape: (n_samples, )
      Vector of cluster assignments.

    method: string, default: "all"
      What method to use. Possible values are "agglomerative", "kmeans", "all".
      If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
      hyperparameter tuning to avoid overfitting).
      If "kmeans" is passed, method will fit KMeans.
      In both cases number of clusters is preset to the correct value.

    seed: int, default: None
      Seed passed to KMeans.

    Returns
    -------
    purity: float
      Purity of the best obtained clustering.

    Notes
    -----
    KMedoids method was excluded as empirically didn't improve over KMeans (for categorization
    tasks available in the package).
    """

    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert method in ["all", "kmeans", "agglomerative"], "Unrecognized method"

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    w.oov = 0
    words = np.vstack([w.get(word, mean_vector) for word in X.flatten()])
    print('{} oov words out of {}'.format(w.oov, len(X.flatten())))
    ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False)

    # Evaluate clustering on several hyperparameters of AgglomerativeClustering and
    # KMeans
    best_purity = 0

    if method == "all" or method == "agglomerative":
        best_purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)),
                                                                       affinity="euclidean",
                                                                       linkage="ward").fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using affinity={} linkage={}".format(best_purity, 'euclidean', 'ward'))
        for affinity in ["cosine", "euclidean"]:
            for linkage in ["average", "complete"]:
                purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=len(set(y)),
                                                                          affinity=affinity,
                                                                          linkage=linkage).fit_predict(words[ids]))
                logger.debug("Purity={:.3f} using affinity={} linkage={}".format(purity, affinity, linkage))
                best_purity = max(best_purity, purity)

    if method == "all" or method == "kmeans":
        purity = calculate_purity(y[ids], KMeans(random_state=seed, n_init=10, n_clusters=len(set(y))).
                                  fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using KMeans".format(purity))
        best_purity = max(purity, best_purity)

    return best_purity


def evaluate_on_all(w, entity_benchmark=False, fastText_ML=False):
    """
    Evaluate Embedding on all fast-running benchmarks

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate.

    entity_benchmark: bool, default: False
      If True, also evaluate on the KORE entity-relatedness benchmark.

    fastText_ML: bool, default: False
      If True, additionally run the multilingual analogy benchmarks.

    Returns
    -------
    results: pandas.DataFrame
      DataFrame with results, one per column.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Calculate results on similarity
    logger.info("Calculating similarity benchmarks")
    similarity_tasks = {
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "WS353R": fetch_WS353(which="relatedness"),
        "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "RW": fetch_RW(),
        "RG65": fetch_RG65(),
        "MTurk": fetch_MTurk()
        # "KORE": fetch_Core()
    }

    similarity_results = {}

    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        logger.info("Spearman correlation of scores on {} {}".format(
            name, similarity_results[name]))

    # Calculate results on analogy
    if fastText_ML:
        logger.info(
            "Calculating analogy benchmarks with multilingual analogy tasks")
        analogy_tasks = {
            # TODO: Add Cs, Zh, Dem, It.
            "Google": fetch_google_analogy(),
            "MSR": fetch_msr_analogy(),
            "Fi": fetch_finish_analogy(),
            "Cs": fetch_czech_analogy(),
            "Zh": fetch_chinese_analogy(),
            "De": fetch_german_analogy(),
            "Es": fetch_spanish_analogy(),
            "Pt": fetch_portuguese_analogy(),
            "It": fetch_italy_analogy()
        }

    else:
        logger.info("Calculating analogy benchmarks")
        analogy_tasks = {
            "Google": fetch_google_analogy(),
            "MSR": fetch_msr_analogy()
        }

    analogy_results = {}

    for name, data in iteritems(analogy_tasks):
        analogy_results[name] = evaluate_analogy(w, data.X, data.y)
        logger.info("Analogy prediction accuracy on {} {}".format(
            name, analogy_results[name]))

    analogy_results["SemEval2012_2"] = evaluate_on_semeval_2012_2(w)['all']
    logger.info("Analogy prediction accuracy on {} {}".format(
        "SemEval2012", analogy_results["SemEval2012_2"]))

    # Calculate results on categorization
    logger.info("Calculating categorization benchmarks")
    categorization_tasks = {
        "AP": fetch_AP(),
        "BLESS": fetch_BLESS(),
        "Battig": fetch_battig(),
        "ESSLI_2c": fetch_ESSLI_2c(),
        "ESSLI_2b": fetch_ESSLI_2b(),
        "ESSLI_1a": fetch_ESSLI_1a()
    }

    categorization_results = {}

    # Calculate results using helper function
    for name, data in iteritems(categorization_tasks):
        categorization_results[name] = evaluate_categorization(
            w, data.X, data.y)
        logger.info("Cluster purity on {} {}".format(
            name, categorization_results[name]))

    # Construct pd table
    cat = pd.DataFrame([categorization_results])
    print(cat)
    analogy = pd.DataFrame([analogy_results])
    print(analogy)
    sim = pd.DataFrame([similarity_results])
    print(sim)
    results = cat.join(sim).join(analogy)

    # Add Kore Evaluation result if entity_benchmark is True.
    if entity_benchmark:
        kore_results = evaluate_on_Kore(w)
        kore = pd.DataFrame([kore_results])
        results = results.join(kore)

    return results
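# --- A hedged helper sketch (added, not part of the original snippet) ---
# It only wires the two optional flags together; evaluate_on_all above does the work.
# "w" is any Embedding (or word -> vector dict) accepted by evaluate_on_all; the output
# path is a hypothetical default.
def run_full_benchmark(w, out_path="benchmarks_full.csv"):
    """Run the full suite with the KORE and multilingual analogy options enabled."""
    results = evaluate_on_all(w, entity_benchmark=True, fastText_ML=True)
    results.to_csv(out_path, index=False)
    return results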