def choose_pretrain_embeddings(model_name, norm=False):
    """Fetch the pre-trained embedding corresponding to model_name."""
    if model_name == "GLOVE":
        embeddings = fetch_GloVe(corpus="wiki-6B", dim=300)
    elif model_name == "FASTTEXT":
        embeddings = fetch_FastText(normalize=norm)
    elif model_name == "SKIPGRAM":
        embeddings = fetch_SG_GoogleNews(normalize=norm)
    elif model_name == "LEXVEC":
        embeddings = fetch_LexVec(normalize=norm)
    elif model_name == "HPCA":
        embeddings = fetch_HPCA(normalize=norm)
    elif model_name == "HDC":
        embeddings = fetch_HDC(normalize=norm)
    else:
        raise ValueError("Unknown embedding model: {}".format(model_name))
    return embeddings
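# Hypothetical smoke test for choose_pretrain_embeddings (not part of the
# original script): a minimal sketch, assuming the fetch_* helpers above are
# imported from web.embeddings and the downloads are cached locally.
if __name__ == "__main__":
    w = choose_pretrain_embeddings("GLOVE")
    print("Loaded GLOVE embeddings as a {} object".format(type(w).__name__))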
parser.add_option("-c", "--clean_words", dest="clean_words",
                  help="Clean_words argument passed to the load_embedding function. "
                       "If set to True, removes most of the non-alphanumeric "
                       "characters, which should speed up evaluation.",
                  default=False)

if __name__ == "__main__":
    (options, args) = parser.parse_args()

    # Load embeddings: fall back to GloVe if no file was given.
    fname = options.filename
    if not fname:
        w = fetch_GloVe(corpus="wiki-6B", dim=300)
    else:
        if not os.path.isabs(fname):
            fname = os.path.join(_get_dataset_dir(), fname)

        # Infer the embedding file format from the extension if not given.
        format = options.format
        if not format:
            _, ext = os.path.splitext(fname)
            if ext == ".bin":
                format = "word2vec_bin"
            elif ext == ".txt":
                format = "word2vec"
            elif ext == ".pkl":
                format = "dict"
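        # The excerpt stops after the format has been inferred. A plausible
        # continuation (a sketch, not the original code) would hand the
        # inferred format to web.embeddings.load_embedding, assuming its
        # usual signature (fname, format, normalize, lower, clean_words):
        w = load_embedding(fname, format=format, normalize=True, lower=False,
                           clean_words=options.clean_words)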
                    datefmt='%I:%M:%S')

if args.incremental:
    model_names = ["SKIPGRAM", "GLOVE", "FASTTEXT", "LEXVEC", "HPCA", "HDC"]
    word_embeddings = dict.fromkeys(model_names)
    unique_words = get_unique_words(tasks)
    # Load one embedding at a time to keep memory usage bounded.
    for model_name in model_names:
        embeddings = choose_pretrain_embeddings(model_name)
        word_embeddings = get_wordvec_dictionary(unique_words, model_name,
                                                 word_embeddings, embeddings)
        del embeddings
    if args.save_vecs is not None:
        save_embeddings(word_embeddings, 'analogy_word_embeddings')
else:
    # Fetch the embeddings up front (warning: this may take a few minutes).
    embeddings = {
        "GLOVE": fetch_GloVe(corpus="wiki-6B", dim=300),
        "FASTTEXT": fetch_FastText(normalize=True),
        "SKIPGRAM": fetch_SG_GoogleNews(normalize=True),
        "LEXVEC": fetch_LexVec(),
    }
    if "MEN" in tasks or "MSR" in tasks:
        get_wordvec_dictionary(tasks, embeddings, save=args.ttype)
    else:
        get_downstream_dictionary(tasks, embeddings, save=args.ttype)
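# Note: get_unique_words, get_wordvec_dictionary, save_embeddings, and
# get_downstream_dictionary are project helpers not shown in this excerpt.
# A hypothetical minimal get_wordvec_dictionary, assuming the Embedding
# object supports `in` and item lookup, could simply collect vectors for
# the task vocabulary:
def get_wordvec_dictionary_sketch(unique_words, model_name, word_embeddings, emb):
    word_embeddings[model_name] = {w: emb[w] for w in unique_words if w in emb}
    return word_embeddings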
# -*- coding: utf-8 -*-
"""
Simple example showing how to evaluate an embedding on similarity datasets.
"""
import logging

from six import iteritems

from web.datasets.similarity import fetch_MEN, fetch_WS353, fetch_SimLex999
from web.embeddings import fetch_GloVe
from web.similarity import evaluate_similarity

# Configure logging
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                    level=logging.DEBUG, datefmt='%I:%M:%S')

# Fetch GloVe embedding (warning: this may take a few minutes)
w_glove = fetch_GloVe(corpus="wiki-6B", dim=300)

# Define tasks
tasks = {
    "MEN": fetch_MEN(),
    "WS353": fetch_WS353(),
    "SIMLEX999": fetch_SimLex999(),
}

# Print sample data
for name, data in iteritems(tasks):
    print("Sample data from {}: pair \"{}\" and \"{}\" is assigned score {}".format(
        name, data.X[0][0], data.X[0][1], data.y[0]))

# Calculate results using the helper function
for name, data in iteritems(tasks):
    print("Spearman correlation of scores on {} {}".format(
        name, evaluate_similarity(w_glove, data.X, data.y)))
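# For intuition, evaluate_similarity scores each word pair by the cosine
# similarity of its two vectors and reports the Spearman correlation of those
# scores with the human judgments. A rough standalone sketch (a simplification,
# assuming the Embedding object supports item lookup; the library version also
# handles out-of-vocabulary words):
import numpy as np
from scipy.stats import spearmanr

def spearman_of_cosines(w, X, y):
    # Stack the vectors of the first and second word of every pair.
    A = np.vstack([w[a] for a, b in X])
    B = np.vstack([w[b] for a, b in X])
    # Row-wise cosine similarity, then rank correlation with the gold scores.
    cos = np.sum(A * B, axis=1) / (np.linalg.norm(A, axis=1) * np.linalg.norm(B, axis=1))
    return spearmanr(cos, y).correlation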