def test_inplace_transform_word_OrderedVocabulary():
    """Strip words in place on an OrderedVocabulary-backed embedding.

    'cat' and '  cat' strip to the same word, so two entries are expected
    afterwards, with 'dog' and 'cat' at positions 0 and 1.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    vocab = OrderedVocabulary(words=['dog', 'cat', '  cat'])
    matrix = np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])
    e = Embedding(vocabulary=vocab, vectors=matrix)
    pe = e.transform_words(lambda word: word.strip(), inplace=True)

    # inplace=True must hand back the very same object
    assert pe is e and pe == e

    assert len(pe.vocabulary) == 2
    assert len(pe.vectors) == 2

    # (word, vector) expected at positions 0 and 1 respectively
    expected = [('dog', [0, 0, 11]), ('cat', [0, 11, 12])]

    for _, vector in expected:
        assert vector in pe.vectors.tolist()

    for word in ('cat', 'dog'):
        assert word in pe.vocabulary.words

    for position, (word, vector) in enumerate(expected):
        assert pe.vocabulary.words[position] == word
        assert np.array_equal(pe.vectors[position], vector)

    assert type(pe.vocabulary) == OrderedVocabulary
def get_vector_pairs(w, X, y, dataset='simlex', save=True):
    """Collect source and target vectors for every distinct word in ``X``.

    Parameters
    ----------
    w : mapping with keys 'source' and 'target'
      Each value is an Embedding, or a plain dict that is converted with
      ``Embedding.from_dict``.

    X : array, shape: (n_samples, 2)
      Word pairs; indexed as ``X[:, 0]`` and ``X[:, 1]``.

    y : unused
      Accepted for interface compatibility.

    dataset : str, default: 'simlex'
      Accepted for interface compatibility; not used.

    save : bool, default: True
      Accepted for interface compatibility. Nothing is written: the previous
      body only built a file name (by adding Embedding objects to strings,
      which fails) and never saved anything.

    Returns
    -------
    (x1, x2) : tuple of arrays
      Stacked source vectors and stacked target vectors, one row per distinct
      word of ``X``. Lookups go through ``get(word, mean_vector)``, with the
      mean of the respective embedding passed as the fallback.
    """
    if isinstance(w, dict):
        # The previous code turned ``w`` into a list and then called
        # ``.keys()`` / ``['source']`` on it, which always raised.
        w = {name: Embedding.from_dict(emb) if isinstance(emb, dict) else emb
             for name, emb in w.items()}

    source, target = w['source'], w['target']

    source_words = source.vocabulary.word_id
    missing_words = sum(1 for query in X for query_word in query
                        if query_word not in source_words)
    if missing_words > 0:
        logger.warning("Missing {} source words. Will replace them with mean vector".format(missing_words))

    mean_vector_source = np.mean(source.vectors, axis=0, keepdims=True)
    mean_vector_target = np.mean(target.vectors, axis=0, keepdims=True)

    x = list(set(list(X[:, 0]) + list(X[:, 1])))
    # np.vstack needs a real sequence; generators are rejected by current NumPy.
    x1 = np.vstack([source.get(word, mean_vector_source) for word in x])
    x2 = np.vstack([target.get(word, mean_vector_target) for word in x])

    return (x1, x2)
def test_inplace_transform_word_CountedVocabulary():
    """Strip words in place on a CountedVocabulary-backed embedding.

    ' cat ' and 'cat' strip to the same word; the test expects two entries
    afterwards and checks their order, vectors and counts.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    vocab = CountedVocabulary(word_count=[(' cat ', 10), ('cat', 50), ('dog', 60)])
    matrix = np.asanyarray([[0, 0, 11], [0, 11, 12], [0, 12, 13]])
    e = Embedding(vocabulary=vocab, vectors=matrix)
    pe = e.transform_words(lambda word: word.strip(), inplace=True)

    # inplace=True must hand back the very same object
    assert pe is e and pe == e

    assert len(pe.vocabulary) == 2
    assert len(pe.vectors) == 2

    # (word, vector, count) expected at positions 0 and 1 respectively
    expected = [('dog', [0, 0, 11], 60), ('cat', [0, 11, 12], 50)]

    for _, vector, _ in expected:
        assert vector in pe.vectors.tolist()

    for word in ('cat', 'dog'):
        assert word in pe.vocabulary.words

    # getstate() is indexed as (words, counts); pair them up by position
    state = pe.vocabulary.getstate()
    counts_by_word = {word: state[1][i] for i, word in enumerate(state[0])}

    for position, (word, vector, count) in enumerate(expected):
        assert pe.vocabulary.words[position] == word
        assert np.array_equal(pe.vectors[position], vector)
        assert counts_by_word[word] == count

    assert type(pe.vocabulary) == CountedVocabulary
def test_noinplace_transform_word_prefer_shortestword2_Vocabulary():
    """Strip words (not in place) on a plain Vocabulary-backed embedding.

    Five raw words strip down to three; the test pins which vector survives
    for each stripped word and where it ends up.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    vocab = Vocabulary(words=['dog', 'cat', '    pikatchu   ', 'pikatchu', ' cat '])
    matrix = np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])
    e = Embedding(vocabulary=vocab, vectors=matrix)
    pe = e.transform_words(lambda word: word.strip(), inplace=False)

    assert len(pe.vocabulary) == 3
    assert len(pe.vectors) == 3

    # dog, cat, pikatchu
    for vector in ([0, 0, 1], [0, 1, 11], [0, 12, 13]):
        assert vector in pe.vectors.tolist()

    for word in ('cat', 'dog', 'pikatchu'):
        assert word in pe.vocabulary.words

    # (position, word, vector), checked in the same order as before
    placements = [
        (2, 'pikatchu', [0, 12, 13]),
        (0, 'dog', [0, 0, 1]),
        (1, 'cat', [0, 1, 11]),
    ]
    for position, word, vector in placements:
        assert pe.vocabulary.words[position] == word
        assert np.array_equal(pe.vectors[position], vector)

    assert type(pe.vocabulary) == Vocabulary
def test_save_2():
    """Round-trip a three-word embedding through the binary word2vec format."""
    w = ["a", "b", "c"]
    vectors = np.array([[1., 2.], [2., 3.], [3., 4.]])
    e = Embedding(Vocabulary(w), vectors)

    # TemporaryDirectory deletes the directory on exit; the earlier
    # tempfile.mkdtemp() call left it behind after every run.
    with tempfile.TemporaryDirectory() as dirpath:
        target = path.join(dirpath, "test.bin")
        Embedding.to_word2vec(e, target, binary=True)
        e2 = Embedding.from_word2vec(target, binary=True)

    assert np.array_equal(e2.vectors, vectors)
def test_save():
    """Save a downloaded embedding as binary and text word2vec and reload both."""
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)

    # TemporaryDirectory deletes the directory on exit; the earlier
    # tempfile.mkdtemp() call left it behind after every run.
    with tempfile.TemporaryDirectory() as dirpath:
        bin_path = path.join(dirpath, "tmp.bin")
        txt_path = path.join(dirpath, "tmp.txt")
        # Called on the class, matching test_save_2 (the embedding is passed explicitly).
        Embedding.to_word2vec(w, bin_path, binary=True)
        Embedding.to_word2vec(w, txt_path, binary=False)
        w2 = Embedding.from_word2vec(bin_path, binary=True)
        w3 = Embedding.from_word2vec(txt_path, binary=False)

    assert np.array_equal(w.vectors, w2.vectors)
    assert w.vocabulary.words == w2.vocabulary.words
    # The text round trip is only compared up to a small absolute tolerance.
    assert np.sum(np.abs(w.vectors - w3.vectors)) < 1e-5
    assert w.vocabulary.words == w3.vocabulary.words
def test_analogy_solver():
    """Check analogy accuracy thresholds on a fixed 1000-question Google analogy sample."""
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")

    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_google_analogy()

    # Fixed seed so the sampled subset is the same on every run
    rng = np.random.RandomState(777)
    ids = rng.choice(range(data.X.shape[0]), 1000, replace=False)
    X, y = data.X[ids], data.y[ids]
    category = data.category_high_level[ids]

    # Default method
    accuracy = evaluate_analogy(w=w, X=X, y=y, category=category)['accuracy']
    assert accuracy['all'] >= 0.65
    assert accuracy['semantic'] >= 0.7
    assert accuracy['syntactic'] >= 0.63

    # Multiplicative method
    accuracy = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul")['accuracy']
    assert accuracy['all'] >= 0.7
    assert accuracy['semantic'] >= 0.75
    assert accuracy['syntactic'] >= 0.64

    # With k=400, "mul" is expected to be at least as good as "add" everywhere
    accuracy_mul = evaluate_analogy(w=w, X=X, y=y, category=category, method="mul", k=400)['accuracy']
    accuracy_add = evaluate_analogy(w=w, X=X, y=y, category=category, method="add", k=400)['accuracy']
    for key in ('all', 'syntactic', 'semantic'):
        assert accuracy_mul[key] >= accuracy_add[key]
def evaluate_similarity(w, X, y, restrict_to_words=None):
    """
    Calculate Spearman correlation between the model's similarity score
    and human rated similarity of word pairs

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    X: array, shape: (n_samples, 2)
      Word pairs

    y: vector, shape: (n_samples,)
      Human ratings

    restrict_to_words:
      Accepted for interface compatibility; not used by this implementation.

    Returns
    -------
    cor: float
      Spearman correlation

    Notes
    -----
    The per-pair score is a plain dot product, which coincides with cosine
    similarity only when the vectors are unit length. Words are looked up with
    ``w.get(word, mean_vector)``, passing the embedding's mean vector as the
    fallback.
    """
    from web.embedding import Embedding

    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    # np.vstack needs a real sequence; generators are rejected by current NumPy.
    A = np.vstack([w.get(word, mean_vector) for word in X[:, 0]])
    B = np.vstack([w.get(word, mean_vector) for word in X[:, 1]])
    scores = np.array([v1.dot(v2.T) for v1, v2 in zip(A, B)])
    return scipy.stats.spearmanr(scores, y).correlation
def test_standardize_preserve_identity():
    """Lower-casing must keep the vector of the already-lowercase 'spider'."""
    vectors_by_word = {"Spider": [3, 4, 5], "spider": [1, 2, 3], "spideR": [3, 2, 4]}
    original = Embedding.from_dict(vectors_by_word)

    # Copying variant
    lowered_copy = original.standardize_words(inplace=False, lower=True)
    assert lowered_copy['spider'][0] == 1

    # In-place variant
    original.standardize_words(inplace=True, lower=True)
    assert original['spider'][0] == 1
def evaluate_ana(wv, w2i, vocab):
    """Row-normalise ``wv`` and run the MSR and Google analogy evaluations.

    wv : array with one vector per row (it is squared and summed along axis 1)
    w2i : mapping from word to row index in ``wv``
    vocab : iterable of words; each must be a key of ``w2i``

    Returns None.
    """
    # Euclidean norm of every row. NOTE: an all-zero row divides by zero here.
    row_norms = np.sum(wv ** 2, 1) ** 0.5
    W_norm = (wv.T / row_norms).T

    evaluate_analogy_msr(W_norm, w2i)
    evaluate_analogy_google(W_norm, w2i)

    wv_dict = {word: W_norm[w2i[word], :] for word in vocab}

    # The resulting Embedding is not used further inside this function.
    w = Embedding.from_dict(wv_dict)

#     analogy_tasks = {
#         "Google": fetch_google_analogy(),
#         "MSR": fetch_msr_analogy()
#     }

#     analogy_results = {}

#     for name, data in iteritems(analogy_tasks):
#         analogy_results[name] = evaluate_analogy(w, data.X, data.y)
#         print("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))
# File: syngcn.py  Project: zxlzr/WordGCN
	def checkpoint(self, epoch, sess):
		"""
		Computes intrinsic scores for the embeddings and, when the average score
		improves on the best seen so far, dumps the embeddings and saves the model

		Parameters
		----------
		epoch:		Current epoch number (not used inside this method)
		sess:		Tensorflow session object

		Returns
		-------
		None
		"""
		embed_matrix, context_matrix = sess.run([self.embed_matrix, self.context_matrix])
		voc2vec = {wrd: embed_matrix[wid] for wrd, wid in self.voc2id.items()}
		embedding = Embedding.from_dict(voc2vec)
		results = evaluate_on_all(embedding)
		# One rounded value per result column (first entry of each column)
		results = {key: round(val[0], 4) for key, val in results.items()}
		curr_int = np.mean(list(results.values()))
		self.logger.info('Current Score: {}'.format(curr_int))

		if curr_int > self.best_int_avg:
			self.logger.info("Saving embedding matrix")
			# The context manager closes (and flushes) the file; it was left open before.
			with open('{}/{}'.format(self.p.emb_dir, self.p.name), 'w') as f:
				for wid, wrd in self.id2voc.items():
					f.write('{} {}\n'.format(wrd, ' '.join([str(round(v, 6)) for v in embed_matrix[wid].tolist()])))

			self.saver.save(sess=sess, save_path=self.save_path)
			self.best_int_avg = curr_int
def test_categorization():
    """Purity on ESSLI_2c must reach at least 0.2 with a fixed seed."""
    data = fetch_ESSLI_2c()
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)

    purity = evaluate_categorization(w, data.X, data.y, seed=777, method="all")
    assert purity >= 0.2
def evaluate_synonyms(e, problems):
    """Score an embedding on multiple-choice synonym questions.

    Parameters
    ----------
    e : Embedding or falsy value
      Embedding to evaluate. When falsy, a random 10-dimensional embedding
      over all question and option words is generated instead.

    problems : iterable of (question, options, answer)
      ``question`` is a word, ``options`` a list of candidate words and
      ``answer`` the index of the correct option.

    Returns
    -------
    score : float
      Fraction of problems answered correctly. A question missing from the
      embedding counts as wrong. Returns 0.0 when ``problems`` is empty
      (previously this raised ZeroDivisionError).
    """
    correct = 0
    total = 0

    if not e:
        all_words = np.concatenate([[q] + o for q, o, _ in problems])
        e = Embedding.from_dict({w: np.random.random(10) for w in all_words})

    # Stand-in vector for options that are missing from the embedding
    meanvec = np.mean(e.vectors, axis=0)

    for question, options, answer in problems:
        if question in e:
            print('question: ' + question)
            q_v = e[question].reshape(1, -1)
            q_ops = np.vstack(
                [e[op] if op in e else meanvec for op in options])
            distances = cdist(q_v, q_ops, metric='cosine')[0]
            # Closest option by cosine distance
            selected = np.argsort(distances)[0]
            if selected == answer:
                correct += 1
        total += 1

    if total == 0:
        return 0.0

    return correct * 1. / total
def evaluate_on_all(w):
    """
    Evaluate Embedding on the WS353 similarity benchmark

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate.

    Returns
    -------
    results: pandas.DataFrame
      DataFrame with results, one per column.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Calculate results on similarity
    logger.info("Calculating similarity benchmarks")
    similarity_tasks = {"WS353": fetch_WS353()}

    similarity_results = {}

    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        logger.info("Spearman correlation of scores on {} {}".format(
            name, similarity_results[name]))

    # Single-row frame: one column per benchmark
    return pd.DataFrame([similarity_results])
def evaluate_simi(wv, w2i, vocab):
    """Print Spearman correlations of ``wv`` on a set of similarity benchmarks.

    wv : array with one vector per row
    w2i : mapping from word to row index in ``wv``
    vocab : iterable of words; each must be a key of ``w2i``

    Returns None; results are only printed.
    """
    wv_dict = {word: wv[w2i[word], :] for word in vocab}
    w = Embedding.from_dict(wv_dict)

    # Calculate results on similarity
    print("Calculating similarity benchmarks")
    similarity_tasks = {
        "WS353": fetch_WS353(),
        "RG65": fetch_RG65(),
        #         "WS353R": fetch_WS353(which="relatedness"),
        #         "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "MTurk": fetch_MTurk(),
        "RW": fetch_RW(),
        "MEN": fetch_MEN(),
    }

    for name, data in iteritems(similarity_tasks):
        # NOTE(review): the closing brace above and this print( opener were
        # missing from the text and have been restored.
        print(
            "Sample data from {}, num of samples: {} : pair \"{}\" and \"{}\" is assigned score {}"
            .format(name, len(data.X), data.X[0][0], data.X[0][1], data.y[0]))
        score = evaluate_similarity(w, data.X, data.y)
        print("Spearman correlation of scores on {} {}".format(name, score))
def evaluate_analogy(w, X, y, method="add", k=None, category=None, batch_size=100):
    """
    Simple method to score embedding using SimpleAnalogySolver

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    method : {"add", "mul"}
      Method to use when finding analogy answer, see "Improving Distributional Similarity
      with Lessons Learned from Word Embeddings"

    X : array-like, shape (n_samples, 3)
      Analogy questions.

    y : array-like, shape (n_samples, )
      Analogy answers.

    k : int, default: None
      If not None will select k top most frequent words from embedding

    batch_size : int, default: 100
      Increase to increase memory consumption and decrease running time

    category : list, default: None
      Category of each example, if passed function returns accuracy per category
      in addition to the overall performance.
      Analogy datasets have "category" field that can be supplied here.

    Returns
    -------
    result: float or pandas.DataFrame
      Overall accuracy when ``category`` is None. Otherwise a DataFrame with
      columns "accuracy", "correct" and "count", one row per category plus
      the row "all" summarising every example.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert category is None or len(category) == y.shape[0], "Passed incorrect category list"

    solver = SimpleAnalogySolver(w=w, method=method, batch_size=batch_size, k=k)
    y_pred = solver.predict(X)

    if category is not None:
        # A plain list compared with ``== cat`` yields a single bool instead of
        # a mask, so make sure we are working with an array.
        category = np.asarray(category)

        results = OrderedDict({"all": np.mean(y_pred == y)})
        count = OrderedDict({"all": len(y_pred)})
        correct = OrderedDict({"all": np.sum(y_pred == y)})
        for cat in set(category):
            mask = category == cat
            hits = y_pred[mask] == y[mask]
            results[cat] = np.mean(hits)
            count[cat] = np.sum(mask)
            correct[cat] = np.sum(hits)

        return pd.concat([pd.Series(results, name="accuracy"),
                          pd.Series(correct, name="correct"),
                          pd.Series(count, name="count")],
                         axis=1)
    else:
        return np.mean(y_pred == y)
def test_standardize():
    """Word cleaning must leave 95 words and keep each surviving word's vector."""
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")

    w = Embedding.from_word2vec(file_name, binary=True)
    w2 = w.standardize_words(inplace=False, lower=False, clean_words=True)
    w3 = Embedding.from_word2vec(file_name, binary=True)

    # Copying variant
    assert len(w2.words) == 95
    for word in w.vocabulary.words:
        # Words that standardise to an empty string are skipped
        if not standardize_string(word, lower=False, clean_words=True):
            continue
        cleaned = standardize_string(word, lower=False, clean_words=True)
        assert np.array_equal(w[word], w2[cleaned])

    # In-place variant
    w3.standardize_words(inplace=True, clean_words=True, lower=False)
    assert len(w3.words) == 95
    for word in w.vocabulary.words:
        if not standardize_string(word, lower=False):
            continue
        cleaned = standardize_string(word, lower=False, clean_words=True)
        assert np.array_equal(w[word], w3[cleaned])
def load_embedding(fname, format="word2vec_bin", normalize=True,
                   lower=False, clean_words=False, load_kwargs=None):
    """
    Loads embeddings from file

    NOTE(review): the tail of the signature and the body of ``if normalize:``
    were missing from the text and have been reconstructed; confirm the
    defaults and the ``normalize_words`` call against the original project.

    Parameters
    ----------
    fname: string
      Path to file containing embedding

    format: string
      Format of the embedding. Possible values are:
      'word2vec_bin', 'word2vec', 'glove', 'dict'

    normalize: bool, default: True
      If true will normalize all vector to unit length

    lower: bool, default: False
      Passed to ``standardize_words``.

    clean_words: bool, default: False
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs: dict, default: None
      Additional parameters passed to load function. Mostly useful for 'glove' format where you
      should pass vocab_size and dim.

    Returns
    -------
    w: Embedding
    """
    assert format in ['word2vec_bin', 'word2vec', 'glove',
                      'dict'], "Unrecognized format"
    if format == "word2vec_bin":
        w = Embedding.from_word2vec(fname, binary=True)
    elif format == "word2vec":
        w = Embedding.from_word2vec(fname, binary=False)
    elif format == "glove":
        # None instead of a shared mutable ``{}`` default
        w = Embedding.from_glove(fname, **(load_kwargs or {}))
    elif format == "dict":
        # SECURITY: unpickling executes arbitrary code; only load trusted files.
        with open(fname, "rb") as handle:
            d = pickle.load(handle, encoding='latin1')
        w = Embedding.from_dict(d)
    if normalize:
        w.normalize_words(inplace=True)
    if lower or clean_words:
        w.standardize_words(lower=lower, clean_words=clean_words, inplace=True)
    return w
def evaluate_on_semeval_2012_2(w):
    """
    Score embedding on the data returned by fetch_semeval_2012_2

    For every category the mean "left minus right" prototype offset is
    compared (dot product) with the offset of each question pair, and the
    resulting scores are correlated with the gold ratings.

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    Returns
    -------
    result: pandas.Series
      Results with spearman correlation per broad category with special key "all" for summary
      spearman correlation
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    data = fetch_semeval_2012_2()
    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    categories = data.y.keys()
    results = defaultdict(list)
    for c in categories:
        # Get mean of left and right vector
        # (np.vstack needs a real sequence; generators are rejected by current NumPy)
        prototypes = data.X_prot[c]
        prot_left = np.mean(
            np.vstack([w.get(word, mean_vector) for word in prototypes[:, 0]]),
            axis=0)
        prot_right = np.mean(
            np.vstack([w.get(word, mean_vector) for word in prototypes[:, 1]]),
            axis=0)

        questions = data.X[c]
        question_left, question_right = (
            np.vstack([w.get(word, mean_vector) for word in questions[:, 0]]),
            np.vstack([w.get(word, mean_vector) for word in questions[:, 1]]),
        )

        scores = np.dot(prot_left - prot_right,
                        (question_left - question_right).T)

        # Broad category = part of the category name before the first "_"
        c_name = data.categories_names[c].split("_")[0]
        # NaN happens when there are only 0s, which might happen for very rare words or
        # very insufficient word vocabulary
        cor = scipy.stats.spearmanr(scores, data.y[c]).correlation
        results[c_name].append(0 if np.isnan(cor) else cor)

    final_results = OrderedDict()
    final_results["all"] = sum(sum(v)
                               for v in results.values()) / len(categories)
    for k in results:
        final_results[k] = sum(results[k]) / len(results[k])
    return pd.Series(final_results)
def test_noinplace_transform_word_prefer_shortestword_CountedVocabulary():
    """Strip words (not in place) on a CountedVocabulary-backed embedding.

    Five raw words strip down to three; the test pins the surviving vector,
    position and count of each stripped word.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

    vocab = CountedVocabulary(
        word_count=[('dog', 60), ('cat', 50), ('    pikatchu   ', 10), ('pikatchu', 10), (' cat ', 5)])
    matrix = np.asanyarray([[0, 0, 1], [0, 1, 11], [0, 11, 12], [0, 12, 13], [0, 13, 14]])
    e = Embedding(vocabulary=vocab, vectors=matrix)
    pe = e.transform_words(lambda word: word.strip(), inplace=False)

    assert len(pe.vocabulary) == 3
    assert len(pe.vectors) == 3

    # dog, cat, pikatchu
    for vector in ([0, 0, 1], [0, 1, 11], [0, 12, 13]):
        assert vector in pe.vectors.tolist()

    for word in ('cat', 'dog', 'pikatchu'):
        assert word in pe.vocabulary.words

    # getstate() is indexed as (words, counts); pair them up by position
    state = pe.vocabulary.getstate()
    counts_by_word = {word: state[1][i] for i, word in enumerate(state[0])}

    # (position, word, vector, count), checked in the same order as before
    placements = [
        (2, 'pikatchu', [0, 12, 13], 10),
        (0, 'dog', [0, 0, 1], 60),
        (1, 'cat', [0, 1, 11], 50),
    ]
    for position, word, vector, count in placements:
        assert pe.vocabulary.words[position] == word
        assert np.array_equal(pe.vectors[position], vector)
        assert counts_by_word[word] == count

    assert type(pe.vocabulary) == CountedVocabulary
def test_similarity():
    """evaluate_similarity must agree for an Embedding and the equivalent dict."""
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    file_name = _fetch_file(url, "test")
    w = Embedding.from_word2vec(file_name, binary=True)
    data = fetch_SimLex999()

    from_embedding = evaluate_similarity(w, data.X, data.y)
    as_dict = dict(zip(w.vocabulary.words, w.vectors))
    from_dict = evaluate_similarity(as_dict, data.X, data.y)

    assert from_dict > 0
    assert from_embedding == from_dict, "evaluate_similarity should return same result for dict and Embedding instance"
# File: evaluate.py  Project: tombosc/cpae
def evaluate_word_analogy(wv, w2i, vocab):
    """Row-normalise ``wv`` and run the MSR and Google analogy evaluations.

    wv : array with one vector per row (it is squared and summed along axis 1)
    w2i : mapping from word to row index in ``wv``
    vocab : iterable of words; each must be a key of ``w2i``

    Returns None.
    """
    # Euclidean norm of every row. NOTE: an all-zero row divides by zero here.
    row_norms = np.sum(wv ** 2, 1) ** 0.5
    W_norm = (wv.T / row_norms).T

    evaluate_analogy_msr(W_norm, w2i)
    evaluate_analogy_google(W_norm, w2i)

    wv_dict = {word: W_norm[w2i[word], :] for word in vocab}

    # The resulting Embedding is not used further inside this function.
    w = Embedding.from_dict(wv_dict)
def evaluate(embed_matrix: dict, voc2id: dict) -> float:
    """
    Computes the average intrinsic score of the embeddings

    Parameters
    ----------
    embed_matrix:   Container of vectors, indexed by the ids stored in ``voc2id``
    voc2id:         Mapping from word to its id

    Returns
    -------
    Mean over all benchmark scores returned by ``evaluate_on_all``, each
    rounded to 4 decimals first.
    """
    # ``np.float`` was removed from NumPy (1.24), so the return annotation
    # uses the builtin float instead.
    voc2vec = {wrd: embed_matrix[wid] for wrd, wid in voc2id.items()}
    embedding = Embedding.from_dict(voc2vec)
    results = evaluate_on_all(embedding)
    # One rounded value per result column (first entry of each column)
    results = {key: round(val[0], 4) for key, val in results.items()}
    return np.mean(list(results.values()))
def evaluate_categorization(w, X, y, method="all", seed=None):
    """
    Evaluate embeddings on categorization task.

    NOTE(review): the tails of the three clustering calls
    (``affinity=…, linkage=…).fit_predict(words[ids])``) were missing from the
    text and have been reconstructed; confirm against the original project.
    Recent scikit-learn releases renamed ``affinity`` to ``metric``.

    Parameters
    ----------
    w: Embedding or dict
      Embedding to test.

    X: vector, shape: (n_samples, )
      Vector of words.

    y: vector, shape: (n_samples, )
      Vector of cluster assignments.

    method: string, default: "all"
      What method to use. Possible values are "agglomerative", "kmeans", "all".
      If "agglomerative" is passed, method will fit AgglomerativeClustering (with very crude
      hyperparameter tuning to avoid overfitting).
      If "kmeans" is passed, method will fit KMeans.
      In both cases number of clusters is preset to the correct value.

    seed: int, default: None
      Seed passed to KMeans.

    Returns
    -------
    purity: float
      Purity of the best obtained clustering.

    Notes
    -----
    KMedoids method was excluded as empirically didn't improve over KMeans (for categorization
    tasks available in the package).
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    assert method in ["all", "kmeans", "agglomerative"], "Uncrecognized method"

    mean_vector = np.mean(w.vectors, axis=0, keepdims=True)
    w.oov = 0
    # np.vstack needs a real sequence; generators are rejected by current NumPy.
    words = np.vstack([w.get(word, mean_vector) for word in X.flatten()])
    print ('{} oov words out of {}'.format(w.oov, len(X.flatten())))
    # Random permutation of the sample indices, reproducible through ``seed``
    ids = np.random.RandomState(seed).choice(range(len(X)), len(X), replace=False)
    n_clusters = len(set(y))

    # Evaluate clustering on several hyperparameters of AgglomerativeClustering and
    # KMeans
    best_purity = 0

    if method == "all" or method == "agglomerative":
        best_purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=n_clusters,
                                                                       affinity="euclidean",
                                                                       linkage="ward").fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using affinity={} linkage={}".format(best_purity, 'euclidean', 'ward'))
        for affinity in ["cosine", "euclidean"]:
            for linkage in ["average", "complete"]:
                purity = calculate_purity(y[ids], AgglomerativeClustering(n_clusters=n_clusters,
                                                                          affinity=affinity,
                                                                          linkage=linkage).fit_predict(words[ids]))
                logger.debug("Purity={:.3f} using affinity={} linkage={}".format(purity, affinity, linkage))
                best_purity = max(best_purity, purity)

    if method == "all" or method == "kmeans":
        purity = calculate_purity(y[ids], KMeans(random_state=seed, n_init=10, n_clusters=n_clusters).
                                  fit_predict(words[ids]))
        logger.debug("Purity={:.3f} using KMeans".format(purity))
        best_purity = max(purity, best_purity)

    return best_purity
def evaluate_on_all(w):
    """
    Evaluate Embedding on all fast-running benchmarks

    Parameters
    ----------
    w: Embedding or dict
      Embedding to evaluate.

    Returns
    -------
    results: pandas.DataFrame
      DataFrame with results, one per column.
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # Calculate results on similarity
    logger.info("Calculating similarity benchmarks")
    similarity_tasks = {
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "WS353R": fetch_WS353(which="relatedness"),
        "WS353S": fetch_WS353(which="similarity"),
        "SimLex999": fetch_SimLex999(),
        "RW": fetch_RW(),
        "RG65": fetch_RG65(),
        "MTurk": fetch_MTurk(),
    }

    similarity_results = {}

    for name, data in iteritems(similarity_tasks):
        similarity_results[name] = evaluate_similarity(w, data.X, data.y)
        logger.info("Spearman correlation of scores on {} {}".format(name, similarity_results[name]))

    # Calculate results on analogy
    logger.info("Calculating analogy benchmarks")
    analogy_tasks = {
        "Google": fetch_google_analogy(),
        "MSR": fetch_msr_analogy(),
    }

    analogy_results = {}

    for name, data in iteritems(analogy_tasks):
        analogy_results[name] = evaluate_analogy(w, data.X, data.y)
        logger.info("Analogy prediction accuracy on {} {}".format(name, analogy_results[name]))

    # SemEval is scored separately; only the summary entry 'all' is kept
    analogy_results["SemEval2012_2"] = evaluate_on_semeval_2012_2(w)['all']
    logger.info("Analogy prediction accuracy on {} {}".format("SemEval2012", analogy_results["SemEval2012_2"]))

    # Calculate results on categorization
    logger.info("Calculating categorization benchmarks")
    categorization_tasks = {
        "AP": fetch_AP(),
        "BLESS": fetch_BLESS(),
        "Battig": fetch_battig(),
        "ESSLI_2c": fetch_ESSLI_2c(),
        "ESSLI_2b": fetch_ESSLI_2b(),
        "ESSLI_1a": fetch_ESSLI_1a(),
    }

    categorization_results = {}

    # Calculate results using helper function
    for name, data in iteritems(categorization_tasks):
        categorization_results[name] = evaluate_categorization(w, data.X, data.y)
        logger.info("Cluster purity on {} {}".format(name, categorization_results[name]))

    # Construct pd table: single row, one column per benchmark
    cat = pd.DataFrame([categorization_results])
    analogy = pd.DataFrame([analogy_results])
    sim = pd.DataFrame([similarity_results])
    results = cat.join(sim).join(analogy)

    return results
def evaluate_on_WordRep(w, max_pairs=1000, solver_kwargs=None):
    """
    Evaluate on WordRep dataset

    Parameters
    ----------
    w : Embedding or dict
      Embedding or dict instance.

    max_pairs: int, default: 1000
      Each category will be constrained to maximum of max_pairs pairs
      (which results in max_pair * (max_pairs - 1) examples)

    solver_kwargs: dict, default: None
      Arguments passed to SimpleAnalogySolver. It is suggested to limit number of words
      in the dictionary. None means no extra arguments.

    Returns
    -------
    result: pandas.DataFrame
      Columns "accuracy", "correct" and "count"; one row per category plus the
      summary rows "wikipedia", "wordnet" and "all".

    References
    ----------
    Bin Gao, Jiang Bian, Tie-Yan Liu (2015)
     "WordRep: A Benchmark for Research on Learning Word Representations"
    """
    if isinstance(w, dict):
        w = Embedding.from_dict(w)

    # None instead of a shared mutable ``{}`` default
    if solver_kwargs is None:
        solver_kwargs = {}

    data = fetch_wordrep()
    categories = set(data.category)

    accuracy = {}
    correct = {}
    count = {}
    for cat in categories:
        X_cat = data.X[data.category == cat]
        X_cat = X_cat[0:max_pairs]

        n_pairs = X_cat.shape[0]
        # Every ordered combination of two different pairs gives one question
        size = n_pairs * (n_pairs - 1)

        logger.info("Processing {} with {} pairs, {} questions".format(cat, n_pairs, size))

        # For each category construct question-answer pairs
        X = np.zeros(shape=(size, 3), dtype="object")
        y = np.zeros(shape=(size,), dtype="object")
        row = 0
        for left, right in product(X_cat, X_cat):
            if not np.array_equal(left, right):
                X[row, 0:2] = left
                X[row, 2] = right[0]
                y[row] = right[1]
                row += 1

        # Run solver
        solver = SimpleAnalogySolver(w=w, **solver_kwargs)
        y_pred = solver.predict(X)
        correct[cat] = float(np.sum(y_pred == y))
        count[cat] = size
        accuracy[cat] = correct[cat] / size

    # Add summary results
    correct['wikipedia'] = sum(correct[c] for c in categories if c in data.wikipedia_categories)
    correct['all'] = sum(correct[c] for c in categories)
    correct['wordnet'] = sum(correct[c] for c in categories if c in data.wordnet_categories)

    count['wikipedia'] = sum(count[c] for c in categories if c in data.wikipedia_categories)
    count['all'] = sum(count[c] for c in categories)
    count['wordnet'] = sum(count[c] for c in categories if c in data.wordnet_categories)

    accuracy['wikipedia'] = correct['wikipedia'] / count['wikipedia']
    accuracy['all'] = correct['all'] / count['all']
    accuracy['wordnet'] = correct['wordnet'] / count['wordnet']

    return pd.concat([pd.Series(accuracy, name="accuracy"),
                      pd.Series(correct, name="correct"),
                      pd.Series(count, name="count")], axis=1)
def test_semeval_solver():
    """Smoke test: the SemEval 2012 evaluation yields a non-negative summary score."""
    url = "https://www.dropbox.com/s/rm756kjvckxa5ol/top100-sgns-googlenews-300.bin?dl=1"
    embedding_path = _fetch_file(url, "test")
    w = Embedding.from_word2vec(embedding_path, binary=True)

    results = evaluate_on_semeval_2012_2(w)
    summary_score = results['all']
    assert summary_score >= 0, "Should have some results on SemEval2012"
def test_wordrep_solver():
    """Smoke test: WordRep evaluation with two pairs per category yields a non-negative accuracy."""
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    embedding_path = _fetch_file(url, "test")
    w = Embedding.from_word2vec(embedding_path, binary=True)

    P = evaluate_on_WordRep(w, max_pairs=2)
    overall_accuracy = P['accuracy']['all']
    assert overall_accuracy >= 0
def test_categorization():
    """Purity on ESSLI_2c must reach at least 0.2 with a fixed seed."""
    data = fetch_ESSLI_2c()
    url = "https://www.dropbox.com/s/5occ4p7k28gvxfj/ganalogy-sg-wiki-en-400.bin?dl=1"
    embedding_path = _fetch_file(url, "test")
    w = Embedding.from_word2vec(embedding_path, binary=True)

    purity = evaluate_categorization(w, data.X, data.y, seed=777, method="all")
    assert purity >= 0.2