def _load_embeddings(self):
    self.w = load_embedding(self.fname,
                            format=self.format,
                            normalize=True,
                            lower=False,
                            clean_words=False,
                            load_kwargs={})
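A minimal sketch of a class this method could live in (the class name, constructor, and default format are assumptions, not from the original source):

from web.embeddings import load_embedding

class EmbeddingLoader:  # hypothetical host class for _load_embeddings
    def __init__(self, fname, format='word2vec'):
        self.fname = fname    # path to the embedding file
        self.format = format  # e.g. 'word2vec', 'word2vec_bin', 'glove'
        self._load_embeddings()

    def _load_embeddings(self):
        self.w = load_embedding(self.fname,
                                format=self.format,
                                normalize=True,
                                lower=False,
                                clean_words=False,
                                load_kwargs={})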
Example no. 2
def benchmark(filepath, savepath, save_file_name):
    from web import evaluate
    from web.embeddings import load_embedding

    w = load_embedding(filepath, format='word2vec')

    # out_fname = os.path.join(savepath, save_file_name + "_results.csv")

    results = evaluate.evaluate_on_all(w)

    # print(results['AP'].item())

    for ind in results.keys():
        print(ind, results[ind].item())
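A hypothetical invocation (the paths are illustrative; note that savepath and save_file_name are only consumed by the commented-out line above):

benchmark('my_vectors.txt', './results/', 'my_model')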
Example no. 3
def _prepare_embeddings(fname, format, kwargs=None):
    """
    Read embeddings from the original file and store them in a pickle file.
    :param fname: filename
    :param format: 'word2vec_bin' or 'glove'
    :param kwargs: required when format == 'glove'; dict with keys 'dim' and 'vocab_size'
    """
    fname_out = '.'.join(fname.split('.')[:-1]) + '.pkl'
    emb = we.load_embedding(fname,
                            format,
                            normalize=False,
                            lower=True,
                            clean_words=False,
                            load_kwargs=kwargs or {})
    with open(fname_out, 'wb') as f:
        pickle.dump(emb, f)
    print('wrote embeddings to', fname_out)
    return emb
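A usage sketch, assuming the enclosing module does `from web import embeddings as we` and `import pickle`, and that the input file exists (file names are illustrative):

import pickle

emb = _prepare_embeddings('GoogleNews-vectors-negative300.bin', 'word2vec_bin')

# Later runs can reload the cached pickle instead of re-parsing the original file.
with open('GoogleNews-vectors-negative300.pkl', 'rb') as f:
    emb = pickle.load(f)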
Example no. 4
import logging

from six import iteritems

from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import load_embedding
from web.evaluate import evaluate_similarity


def call_module(g_filename):
  # Configure logging
  logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                      level=logging.DEBUG, datefmt='%I:%M:%S')

  # Fetch GloVe embedding (warning: it might take a few minutes)
  # w_glove = fetch_GloVe(corpus="wiki-6B", dim=300)
  kargs = {'vocab_size': 200000, 'dim': 400}
  fname = g_filename
  w_custom = load_embedding(fname, format="glove", normalize=True,
                            lower=True, clean_words=False, load_kwargs=kargs)

  # Define tasks
  tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
  }

  # Print sample data
  for name, data in iteritems(tasks):
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}"
          .format(name, data.X[0][0], data.X[0][1], data.y[0]))

  # Calculate results using helper function
  for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}"
          .format(name, evaluate_similarity(w_custom, data.X, data.y)))
Example no. 5

        format = options.format

        if not format:
            _, ext = os.path.splitext(fname)
            if ext == ".bin":
                format = "word2vec_bin"
            elif ext == ".txt":
                format = "word2vec"
            elif ext == ".pkl":
                format = "dict"

        assert format in ['word2vec_bin', 'word2vec', 'glove',
                          'bin'], "Unrecognized format"

        load_kwargs = {}
        if format == "glove":
            load_kwargs['vocab_size'] = sum(1 for line in open(fname))
            load_kwargs['dim'] = len(next(open(fname)).split()) - 1

        w = load_embedding(fname,
                           format=format,
                           normalize=True,
                           lower=False,
                           clean_words=options.clean_words,
                           load_kwargs=load_kwargs)

    out_fname = options.output if options.output else "results.csv"

    results = evaluate_on_all(w)

    logger.info("Saving results...")
    print(results)
    results.to_csv(out_fname)
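The glove branch above infers vocab_size and dim by scanning the file itself; a toy illustration of that inference (the file name is illustrative):

# A GloVe-style text file has one word plus its vector per line, no header.
with open('toy.glove.txt', 'w') as f:
    f.write('cat 0.1 0.2 0.3\n')
    f.write('dog 0.4 0.5 0.6\n')

vocab_size = sum(1 for line in open('toy.glove.txt'))  # 2 lines -> vocab_size = 2
dim = len(next(open('toy.glove.txt')).split()) - 1     # 4 tokens - 1 word -> dim = 3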
Example no. 6
import argparse
import pickle

from web.datasets.similarity import fetch_WS353
from web.embeddings import load_embedding
from web.evaluate import evaluate_similarity

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--word-vectors')
    args = parser.parse_args()

    ws353 = fetch_WS353()

    embedding = load_embedding(args.word_vectors,
                               lower=True,
                               clean_words=True,
                               format='dict')
    print("Spearman's rank on WS353:",
          evaluate_similarity(embedding, ws353.X, ws353.y))
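For format='dict' the loader expects a pickled Python dict mapping words to vectors (to the best of my knowledge of the web package); a sketch of producing a compatible file (names are illustrative):

import pickle
import numpy as np

vectors = {'king': np.random.rand(300), 'queen': np.random.rand(300)}
with open('my_vectors.pkl', 'wb') as f:
    pickle.dump(vectors, f)

# Then: python this_script.py --word-vectors my_vectors.pkl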
Example no. 7
    if not is_dict_embedding and not args.vocab_size:
        raise ValueError("GloVe embeddings require a vocab size")

    if args.lowercase or args.normalize:
        raise NotImplementedError('Bug')

    if is_dict_embedding:  # these load_kwargs are unavailable for GloVe
        kwargs_emb = {"normalize": args.normalize,
                      "lowercase": args.lowercase}
    else:
        kwargs_emb = {"dim": 300,
                      "vocab_size": args.vocab_size}
    emb = load_embedding(args.emb_filename, format=args.emb_format,
                         load_kwargs=kwargs_emb, lowercase_if_OOV=False,
                         lemmatize_if_OOV=False, normalize=False)


    model_name = args.emb_filename.split('/')[-2]
    # TODO: do dim and vocab_size need to be passed here, or are they unused?

    vocab_defs, dict_, test_dict = None, None, None
    if is_dict_embedding:
        vocab_defs = Vocabulary(vocab_defs_fname)
        fname_dict = os.path.join(args.root_dicts, "all.json")
        fname_test_dict = os.path.join(args.root_dicts, "test.json")
        dict_ = load_dict(fname_dict)
        test_dict = load_dict(fname_test_dict)

    dirname = os.path.join('results/figures/', model_name)
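A hedged sketch of the argument namespace this fragment expects, with option names inferred purely from the args.* attribute accesses above:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--emb_filename')
parser.add_argument('--emb_format')            # e.g. 'dict' or 'glove'
parser.add_argument('--vocab_size', type=int)  # required for GloVe embeddings
parser.add_argument('--normalize', action='store_true')
parser.add_argument('--lowercase', action='store_true')
parser.add_argument('--root_dicts')
args = parser.parse_args()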
Example no. 8
# first argument is the count method (sys.argv[1])
# second argument is the weight method (sys.argv[2])

import logging
import sys

from six import iteritems

from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import load_embedding
from web.evaluate import evaluate_similarity

# Configure logging
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                    level=logging.DEBUG,
                    datefmt='%I:%M:%S')

# Fetch GloVe embedding (warning: it might take a few minutes)
# w_glove = fetch_GloVe(corpus="wiki-6B", dim=300)
kargs = {'vocab_size': 200000, 'dim': 400}
fname = '/home/student/Desktop/paper_1/hadoop-1.2.1/1_sparse_matrix/' + str(
    sys.argv[2]) + '_' + str(sys.argv[1]) + '_' + 'embeddings'
w_custom = load_embedding(fname,
                          format="glove",
                          normalize=True,
                          lower=True,
                          clean_words=False,
                          load_kwargs=kargs)
# Define tasks
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999()
}

# Print sample data
for name, data in iteritems(tasks):
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".
          format(name, data.X[0][0], data.X[0][1], data.y[0]))

# Calculate results using helper function
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(
        name, evaluate_similarity(w_custom, data.X, data.y)))
Example no. 9
    for wordfile in wordfiles:
        format = 'word2vec_bin'
        embeddings_name = wordfile
        if wordfile.endswith('.dict.pickle'):
            # os.path.basename(fname)
            embeddings_name = wordfile.replace('.dict.pickle', '')
            format = 'dict'

        printflush('Processing ' + wordfile + ' ')
        if computed is not None and embeddings_name in computed:
            print('[already computed]')
        else:
            printflush('[loading...')
            w = load_embedding(join(dname, wordfile),
                               format=format,
                               normalize=True,
                               lower=True,
                               clean_words=options.clean_words,
                               name=embeddings_name)
            printflush('OK][evaluating...')
            embedding_results = evaluate_on_selection(w)
            metadata = get_metadata_from_name(embeddings_name)
            embedding_results = embedding_results.join(metadata)
            if results is not None:
                # DataFrame.append is deprecated; pd.concat is the equivalent
                # (assumes the enclosing script does `import pandas as pd`)
                results = pd.concat([results, embedding_results])
            else:
                results = embedding_results
            print('OK]')
            results.to_csv(out_fname, index_label='embeddings',
                           index=False)  # write after each embedding, just in case

    print("Done")
Example no. 10
            elif ext == ".pkl":
                format = "dict"

        assert format in [
            'word2vec_bin', 'word2vec', 'glove', 'bin', 'dict', 'dict_poly'
        ], "Unrecognized format"

        load_kwargs = {}
        if format == "glove":
            vocab_size = sum(1 for line in open(fname))
            dim = len(next(open(fname)).split()) - 1
            load_kwargs = {'dim': dim, 'vocab_size': vocab_size}

        w = load_embedding(fname,
                           format=format,
                           normalize=options.normalize,
                           clean_words=options.clean_words,
                           lower=options.lowercase,
                           lowercase_if_OOV=options.lower_or_lemma,
                           lemmatize_if_OOV=options.lower_or_lemma,
                           load_kwargs=load_kwargs)

    out_fname = options.output if options.output else "results.csv"

    if options.multi_prototype:
        results = evaluate_on_all_multi(w, options.model)
    else:
        results = evaluate_on_all(w, only_sim_rel=options.only_sim_rel)

    logger.info("Saving results...")
    print(results)
    results.to_csv(out_fname)
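Several of these fragments read flags from an options object; a hedged optparse sketch covering the attributes referenced above (flag spellings and defaults are assumptions):

from optparse import OptionParser

parser = OptionParser()
parser.add_option("--format", dest="format", default=None)
parser.add_option("--output", dest="output", default=None)
parser.add_option("--clean-words", dest="clean_words", action="store_true", default=False)
parser.add_option("--normalize", dest="normalize", action="store_true", default=False)
parser.add_option("--lowercase", dest="lowercase", action="store_true", default=False)
parser.add_option("--lower-or-lemma", dest="lower_or_lemma", action="store_true", default=False)
parser.add_option("--multi-prototype", dest="multi_prototype", action="store_true", default=False)
parser.add_option("--only-sim-rel", dest="only_sim_rel", action="store_true", default=False)
parser.add_option("--model", dest="model", default=None)
options, _ = parser.parse_args()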
Example no. 11

import logging

import numpy as np
import _pickle as pickle

from web.datasets.analogy import fetch_google_analogy
from web.embeddings import fetch_SG_GoogleNews, load_embedding
from web.evaluate import evaluate_on_all

w = load_embedding("tf_w2vec_dict.p", format="dict")
data = fetch_google_analogy()  # fetched here but not used in this snippet

out_fname = "tf_results.csv"
results = evaluate_on_all(w)
print("Saving results...")
print(results)
results.to_csv(out_fname)
Example no. 12
            _, ext = os.path.splitext(fname)
            if ext == ".bin":
                format = "word2vec_bin"
            elif ext == ".txt":
                format = "word2vec"
            elif ext == ".pkl":
                format = "dict"

        # assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin'], "Unrecognized format"

        load_kwargs = {}
        if format == "glove":
            vocab_size = sum(1 for line in open(fname))
            dim = len(next(open(fname)).split()) - 1
            load_kwargs = {"vocab_size": vocab_size, "dim": dim}
        w = load_embedding(fname,
                           format=format,
                           normalize=True,
                           lower=True,
                           clean_words=options.clean_words,
                           load_kwargs=load_kwargs,
                           nonorm=options.nonorm,
                           nocheat=options.nocheat)

    out_fname = options.output if options.output else "results.csv"

    results = evaluate_on_all(w, options.nocheat)

    logger.info("Saving results...")
    print(results)
    results.to_csv(out_fname)
Example no. 13

            fname = os.path.join(_get_dataset_dir(), fname)

        format = options.format

        if not format:
            _, ext = os.path.splitext(fname)
            if ext == ".bin":
                format = "word2vec_bin"
            elif ext == ".txt":
                format = "word2vec"
            elif ext == ".pkl":
                format = "dict"

        assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin'], "Unrecognized format"

        load_kwargs = {}
        if format == "glove":
            vocab_size = sum(1 for line in open(fname))
            dim = len(next(open(fname)).split()) - 1
            load_kwargs = {'vocab_size': vocab_size, 'dim': dim}

        w = load_embedding(fname, format=format, normalize=True, lower=True,
                           clean_words=options.clean_words,
                           load_kwargs=load_kwargs)

    out_fname = options.output if options.output else "results.csv"

    results = evaluate_on_all(w)

    logger.info("Saving results...")
    print(results)
    results.to_csv(out_fname)
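The saved CSV can be inspected afterwards with pandas (a trivial usage note; the file name matches the default above):

import pandas as pd

print(pd.read_csv('results.csv').head())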