def _load_embeddings(self):
    """Load word embeddings from ``self.fname`` into ``self.w``.

    Vectors are L2-normalized on load; the vocabulary is kept as-is
    (no lowercasing, no word cleaning, no extra loader kwargs).
    """
    self.w = load_embedding(
        self.fname,
        format=self.format,
        normalize=True,
        lower=False,
        clean_words=False,
        load_kwargs={},
    )
def benchmark(filepath, savepath, save_file_name):
    """Run the full `web` benchmark suite on a word2vec-format embedding.

    Results are printed to stdout, one ``metric value`` line per task.
    ``savepath`` and ``save_file_name`` are currently unused (the CSV
    export they were meant for is disabled).
    """
    from web.embeddings import load_embedding

    embedding = load_embedding(filepath, format='word2vec')
    scores = evaluate.evaluate_on_all(embedding)
    for metric in scores.keys():
        print(metric, scores[metric].item())
def _prepare_embeddings(fname, format, kwargs=None):
    """Read embeddings from the original file and cache them as a pickle.

    :param fname: path to the embedding file
    :param format: word2vec_binary, glove
    :param kwargs: needed in case format=='glove'; dict with keys 'dim'
        and 'vocab_size'
    :return: the loaded embedding object
    """
    # Fix: a mutable default ({}) is shared across calls; use None and
    # create a fresh dict per call instead.
    if kwargs is None:
        kwargs = {}
    fname_out = '.'.join(fname.split('.')[:-1]) + '.pkl'
    emb = we.load_embedding(fname, format, normalize=False, lower=True,
                            clean_words=False, load_kwargs=kwargs)
    # Fix: 'with' guarantees the output file is closed even if dump raises
    # (the original left the handle open).
    with open(fname_out, 'wb') as fh:
        pickle.dump(emb, fh)
    print('wrote embeddings to ', fname_out)
    return emb
def call_module(g_filename):
    """Evaluate a GloVe embedding file on MEN, WS353 and SimLex-999.

    :param g_filename: path to a GloVe-format embedding file
    """
    # Configure logging
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        level=logging.DEBUG, datefmt='%I:%M:%S')

    # Load the embedding (warning: it might take few minutes).
    # NOTE(review): vocab_size/dim are hard-coded — confirm they match
    # the file being loaded.
    glove_kwargs = {'vocab_size': 200000, 'dim': 400}
    w_custom = load_embedding(g_filename, format="glove", normalize=True,
                              lower=True, clean_words=False,
                              load_kwargs=glove_kwargs)

    # Similarity benchmarks to evaluate against.
    tasks = {
        "MEN": fetch_MEN(),
        "WS353": fetch_WS353(),
        "SIMLEX999": fetch_SimLex999(),
    }

    # Show one example pair per dataset.
    for name, data in iteritems(tasks):
        print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(
            name, data.X[0][0], data.X[0][1], data.y[0]))

    # Spearman correlation between embedding similarities and gold scores.
    for name, data in iteritems(tasks):
        print("Spearman correlation of scores on {} {}".format(
            name, evaluate_similarity(w_custom, data.X, data.y)))
# Infer the embedding format from the file extension.
_, ext = os.path.splitext(fname)
if ext == ".bin":
    format = "word2vec_bin"
elif ext == ".txt":
    format = "word2vec"
elif ext == ".pkl":
    format = "dict"
# Fix: 'dict' was missing from this whitelist, so the '.pkl' branch above
# could never pass the assertion.
assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin', 'dict'], \
    "Unrecognized format"
load_kwargs = {}
if format == "glove":
    # GloVe text files carry no header, so derive vocab_size (line count)
    # and dim (tokens per line minus the word itself) by scanning the file.
    # Fix: 'with' closes the handles (the original leaked two open files).
    with open(fname) as f:
        load_kwargs['vocab_size'] = sum(1 for line in f)
    with open(fname) as f:
        load_kwargs['dim'] = len(next(f).split()) - 1
w = load_embedding(fname, format=format, normalize=True, lower=False,
                   clean_words=options.clean_words, load_kwargs=load_kwargs)
out_fname = options.output if options.output else "results.csv"
results = evaluate_on_all(w)
logger.info("Saving results...")
print(results)
results.to_csv(out_fname)
import argparse
import pickle

from web.datasets.similarity import fetch_WS353
from web.embeddings import load_embedding
from web.evaluate import evaluate_similarity


if __name__ == '__main__':
    # Evaluate a pickled word-embedding dict on the WS353 similarity task.
    parser = argparse.ArgumentParser()
    parser.add_argument('--word-vectors')
    args = parser.parse_args()

    ws353 = fetch_WS353()
    # Embeddings are stored as a dict; lowercase and clean the vocabulary
    # on load so lookups match the dataset's word pairs.
    embedding = load_embedding(args.word_vectors,
                               lower=True,
                               clean_words=True,
                               format='dict')
    print('Spearman`s rank on WS353 ',
          evaluate_similarity(embedding, ws353.X, ws353.y))
# --- Load embeddings and (for dict models) the definition dictionaries ---
if not is_dict_embedding and not args.vocab_size:
    raise ValueError("GloVe embeddings require a vocab size")
if args.lowercase or args.normalize:
    # These flags are accepted on the CLI but not wired through yet.
    raise NotImplementedError('Bug')
# Fix: removed the dead 'kwargs_emb = {}' initializer — both branches
# below unconditionally reassign it.
if is_dict_embedding:
    # normalize/lowercase are loader kwargs only for dict embeddings
    # (unavailable for GloVe).
    kwargs_emb = {"normalize": args.normalize, "lowercase": args.lowercase}
else:
    # NOTE(review): dim is hard-coded to 300 — confirm every GloVe file
    # used here matches.
    kwargs_emb = {"dim": 300, "vocab_size": args.vocab_size}
emb = load_embedding(args.emb_filename,
                     format=args.emb_format,
                     load_kwargs=kwargs_emb,
                     lowercase_if_OOV=False,
                     lemmatize_if_OOV=False,
                     normalize=False)
# The parent directory name identifies the model in result paths.
model_name = args.emb_filename.split('/')[-2]
# TODO: need to feed dim and vocab_size? or useless?
vocab_defs, dict_, test_dict = None, None, None
if is_dict_embedding:
    vocab_defs = Vocabulary(vocab_defs_fname)
    fname_dict = os.path.join(args.root_dicts, "all.json")
    fname_test_dict = os.path.join(args.root_dicts, "test.json")
    dict_ = load_dict(fname_dict)
    test_dict = load_dict(fname_test_dict)
dirname = os.path.join('results/figures/', model_name)
# first argument is count method (sys.argv[1])
# second argument is weight method (sys.argv[2])

# Configure logging
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                    level=logging.DEBUG, datefmt='%I:%M:%S')

# Build the embedding path from the two CLI arguments and load it
# (warning: it might take few minutes).
# NOTE(review): the base path and vocab_size/dim are hard-coded — verify
# they match the file actually being evaluated.
kargs = {'vocab_size': 200000, 'dim': 400}
fname = ('/home/student/Desktop/paper_1/hadoop-1.2.1/1_sparse_matrix/'
         + str(sys.argv[2]) + '_' + str(sys.argv[1]) + '_' + 'embeddings')
w_custom = load_embedding(fname, format="glove", normalize=True,
                          lower=True, clean_words=False, load_kwargs=kargs)

# Similarity benchmarks to evaluate against.
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999(),
}

# Print one sample pair from each dataset.
for name, data in iteritems(tasks):
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".
          format(name, data.X[0][0], data.X[0][1], data.y[0]))

# Calculate results using helper function
import pandas as pd  # Fix: DataFrame.append was removed in pandas 2.0; use concat.

for wordfile in wordfiles:
    # Default to binary word2vec; pickled dicts are detected by suffix.
    format = 'word2vec_bin'
    embeddings_name = wordfile
    if wordfile.endswith('.dict.pickle'):
        embeddings_name = wordfile.replace('.dict.pickle', '')
        format = 'dict'
    printflush('Processing ' + wordfile + ' ')
    if computed is not None and embeddings_name in computed:
        print('[already computed]')
    else:
        printflush('[loading...')
        w = load_embedding(join(dname, wordfile), format=format,
                           normalize=True, lower=True,
                           clean_words=options.clean_words,
                           name=embeddings_name)
        printflush('OK][evaluating...')
        embedding_results = evaluate_on_selection(w)
        metadata = get_metadata_from_name(embeddings_name)
        embedding_results = embedding_results.join(metadata)
        if results is not None:
            # pd.concat([a, b]) is the supported equivalent of the removed
            # a.append(b) (same default index handling).
            results = pd.concat([results, embedding_results])
        else:
            results = embedding_results
        print('OK]')
        # Write after every embedding so a crash loses nothing (just in case).
        results.to_csv(out_fname, index_label='embeddings', index=False)
print("Done")
format = "dict"
# Accept every loader format this script can dispatch on.
assert format in [
    'word2vec_bin', 'word2vec', 'glove', 'bin', 'dict', 'dict_poly'
], "Unrecognized format"
load_kwargs = {}
if format == "glove":
    # GloVe text files have no header row: count lines for vocab_size and
    # tokens-per-line (minus the word) for dim.
    # Fix: 'with' closes the handles — the original leaked two open files.
    with open(fname) as f:
        vocab_size = sum(1 for line in f)
    with open(fname) as f:
        dim = len(next(f).split()) - 1
    load_kwargs = {'dim': dim, 'vocab_size': vocab_size}
w = load_embedding(fname,
                   format=format,
                   normalize=options.normalize,
                   clean_words=options.clean_words,
                   lower=options.lowercase,
                   lowercase_if_OOV=options.lower_or_lemma,
                   lemmatize_if_OOV=options.lower_or_lemma,
                   load_kwargs=load_kwargs)
out_fname = options.output if options.output else "results.csv"
if options.multi_prototype:
    results = evaluate_on_all_multi(w, options.model)
else:
    results = evaluate_on_all(w, only_sim_rel=options.only_sim_rel)
logger.info("Saving results...")
print(results)
results.to_csv(out_fname)
import logging
from web.datasets.analogy import fetch_google_analogy
from web.embeddings import fetch_SG_GoogleNews
import numpy as np
import _pickle as pickle
from web.embeddings import load_embedding

# Load a TensorFlow-trained embedding stored as a pickled dict.
w = load_embedding("tf_w2vec_dict.p", format="dict")

data = fetch_google_analogy()

from web.evaluate import evaluate_on_all

# Run the full benchmark suite and dump the per-task scores to CSV.
out_fname = "tf_results.csv"
results = evaluate_on_all(w)
print("Saving results...")
print(results)
results.to_csv(out_fname)
# Infer the embedding format from the file extension.
if ext == ".bin":
    format = "word2vec_bin"
elif ext == ".txt":
    format = "word2vec"
elif ext == ".pkl":
    format = "dict"
# assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin'], "Unrecognized format"
load_kwargs = {}
if format == "glove":
    # GloVe text files have no header: count lines for vocab_size and
    # tokens-per-line (minus the word) for dim.
    # Fix: 'with' closes the handles — the original leaked two open files.
    with open(fname) as f:
        vocab_size = sum(1 for line in f)
    with open(fname) as f:
        dim = len(next(f).split()) - 1
    load_kwargs = {"vocab_size": vocab_size, "dim": dim}
w = load_embedding(fname, format=format, normalize=True, lower=True,
                   clean_words=options.clean_words, load_kwargs=load_kwargs,
                   nonorm=options.nonorm, nocheat=options.nocheat)
out_fname = options.output if options.output else "results.csv"
results = evaluate_on_all(w, options.nocheat)
logger.info("Saving results...")
print(results)
results.to_csv(out_fname)
fname = os.path.join(_get_dataset_dir(), fname)
format = options.format
if not format:
    # Fall back to guessing the format from the file extension.
    _, ext = os.path.splitext(fname)
    if ext == ".bin":
        format = "word2vec_bin"
    elif ext == ".txt":
        format = "word2vec"
    elif ext == ".pkl":
        format = "dict"
# Fix: 'dict' was missing from this whitelist, so the '.pkl' branch above
# could never pass the assertion.
assert format in ['word2vec_bin', 'word2vec', 'glove', 'bin', 'dict'], \
    "Unrecognized format"
load_kwargs = {}
if format == "glove":
    # Fix: vocab_size and dim were computed but never stored, so the GloVe
    # loader received an empty load_kwargs dict. 'with' also closes the
    # file handles the original leaked.
    with open(fname) as f:
        vocab_size = sum(1 for line in f)
    with open(fname) as f:
        dim = len(next(f).split()) - 1
    load_kwargs = {'vocab_size': vocab_size, 'dim': dim}
w = load_embedding(fname, format=format, normalize=True, lower=True,
                   clean_words=options.clean_words, load_kwargs=load_kwargs)
out_fname = options.output if options.output else "results.csv"
results = evaluate_on_all(w)
logger.info("Saving results...")
print(results)
results.to_csv(out_fname)