from embeddings import EmbeddingsDictionary

dictionary = EmbeddingsDictionary(100000)
neighbors_geek = dictionary.w2neighbors('geek', 10)
neighbors_man = dictionary.w2neighbors('man', 10)
neighbors_woman = dictionary.w2neighbors('woman', 10)
dictionary.analogy('ai', 'human', 'concert')
from embeddings import EmbeddingsDictionary

emb = EmbeddingsDictionary(
    max_words=200000,
    path='/Users/nmanzini/goinfre/wiki-news-300d-1M.vec',
    normalize=True,
    word_whitelist=None)

# print("index of 'people'")
# print(emb.dictionary['people'])
# print("w2neighbors of 'people'")
# print(emb.w2neighbors('people', 10))
# print("emb.words[65]")
# print(emb.words[65])
# print(emb.w2neighbors('geek', 10))

# a = emb.emb[emb.dictionary['London']]
# b = emb.emb[emb.dictionary['day']]
# c = emb.emb[emb.dictionary['rain']]
# query = a + b - c
# _scores, lst_closest = emb.emb2neighbors(query)
# print("emb.emb2neighbors(query)")
# for word in lst_closest:
#     print(emb.words[word], end=', ')
from embeddings import EmbeddingsDictionary

emb = EmbeddingsDictionary(100000)

#1
print('#1')
print(emb.w2neighbors('geek', 10))
print()

#2
print('#2')
emb.analogy('King', 'woman', 'man')
print()
emb.analogy('sushi', 'Rome', 'Tokyo')
print()
emb.analogy('uncle', 'woman', 'man')
print()
emb.analogy('puppy', 'cat', 'dog')
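# --- Hedged sketch, not the library's implementation: analogy('King', 'woman', 'man')
# --- presumably answers "man is to King as woman is to ?" with the usual embedding
# --- arithmetic, mirroring the commented-out emb2neighbors query example elsewhere
# --- in this repo. analogy_sketch is a hypothetical helper, named here for illustration.
def analogy_sketch(emb, a, b, c):
    # query = vec(a) + vec(b) - vec(c), e.g. King + woman - man ~= Queen
    query = (emb.emb[emb.dictionary[a]]
             + emb.emb[emb.dictionary[b]]
             - emb.emb[emb.dictionary[c]])
    _scores, closest = emb.emb2neighbors(query)
    return [emb.words[i] for i in closest]

print(analogy_sketch(emb, 'King', 'woman', 'man'))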
from embeddings import EmbeddingsDictionary

emb = EmbeddingsDictionary(100000)
# pass an explicit neighbor count, matching the other scripts
print(emb.w2neighbors('geek', 10))
import logging

import torch.nn as nn
import torch.optim as optim

import dataset
from dataset import SifDataset  # assumption: SifDataset lives in the local dataset module
from embeddings import EmbeddingsDictionary
from model import BowModel

logger = logging.getLogger()
# set the logger level to INFO so everything we log below is written
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s: %(message)s', '%m/%d/%Y %I:%M:%S %p')
console = logging.StreamHandler()
console.setFormatter(fmt)
console.setLevel(logging.INFO)
logger.addHandler(console)

# Here we load only the embeddings for the words that appear in SST
# (via the whitelist). You can change it if you want.
all_words = set(line.strip() for line in open('all_sst_words.txt'))
emb_dict = EmbeddingsDictionary(word_whitelist=all_words)

data = SifDataset()
train_exs, train_labels, train_freq = dataset.preprocess_dataset(
    data.train, emb_dict.dictionary)
logging.info('Loaded train, size={}, npos={}'.format(
    len(train_exs), sum(train_labels).sum()))
dev_exs, dev_labels, dev_freq = dataset.preprocess_dataset(
    data.dev, emb_dict.dictionary)
logging.info('Loaded dev, size={}, npos={}'.format(
    len(dev_exs), sum(dev_labels).sum()))

model = BowModel(emb_dict.emb, train_freq)
loss_fn = nn.NLLLoss()
optimized_params = filter(lambda p: p.requires_grad, model.parameters())
optimizer = optim.Adam(optimized_params, lr=0.003)
import logging

import torch.nn as nn
import torch.optim as optim

import dataset
from dataset import SifDataset  # assumption: SifDataset lives in the local dataset module
from embeddings import EmbeddingsDictionary
from model import BowModel

logger = logging.getLogger()
# set the logger level to INFO so everything we log below is written
logger.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s: %(message)s', '%m/%d/%Y %I:%M:%S %p')
console = logging.StreamHandler()
console.setFormatter(fmt)
console.setLevel(logging.INFO)
logger.addHandler(console)

# Here we load only the embeddings for the words that appear in SST
# (via the whitelist). You can change it if you want.
all_words = set(line.strip() for line in open('all_sst_words.txt'))
emb_dict = EmbeddingsDictionary(
    word_whitelist=all_words,
    path='data/wiki-news-300d-1M.vec'
)
# emb_dict = EmbeddingsDictionary(word_whitelist=all_words)

data = SifDataset()
train_exs, train_labels = dataset.preprocess_dataset(data.train, emb_dict.dictionary)
logging.info('Loaded train, size={}, npos={}'.format(
    len(train_exs), sum(train_labels).sum()))
dev_exs, dev_labels = dataset.preprocess_dataset(data.dev, emb_dict.dictionary)
logging.info('Loaded dev, size={}, npos={}'.format(
    len(dev_exs), sum(dev_labels).sum()))

model = BowModel(emb_dict.emb)
loss_fn = nn.NLLLoss()
optimized_params = filter(lambda p: p.requires_grad, model.parameters())
optimizer = optim.Adam(optimized_params, lr=0.003)
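# --- Hedged sketch, not from the repo: the two scripts above stop right after
# --- building the model, loss, and optimizer. A minimal per-example training loop,
# --- assuming train_exs holds tensors and BowModel returns log-probabilities
# --- (which is what the nn.NLLLoss above expects); the epoch count, batch size of
# --- one, and tensor shapes are all assumptions.
import torch

n_epochs = 5  # hypothetical setting
for epoch in range(n_epochs):
    total_loss = 0.0
    for ex, label in zip(train_exs, train_labels):
        optimizer.zero_grad()
        log_probs = model(ex.unsqueeze(0))  # assumed: one example as a batch of size 1
        target = label.view(1) if torch.is_tensor(label) else torch.tensor([label])
        loss = loss_fn(log_probs, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    logging.info('epoch {}: avg train loss {:.4f}'.format(
        epoch, total_loss / len(train_exs)))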
from embeddings import EmbeddingsDictionary

emb = EmbeddingsDictionary(250000)
print(emb.w2neighbors("geek", 10))

query_embedding = emb.embed("Facebook") + emb.embed("Google")
emb.analogy('fifty-five', 'five', 'twenty')
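# --- Hedged sketch: query_embedding above is built but never inspected. Assuming
# --- emb2neighbors(query) returns (scores, word indices), as in the commented-out
# --- example earlier in this repo, its nearest neighbors can be printed like this:
_scores, closest = emb.emb2neighbors(query_embedding)
for idx in closest:
    print(emb.words[idx], end=', ')
print()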