def main(we_file, w2i_file, n_files=50): cc_matrix = "cc_matrix_%s.npy" % n_files # hacky way of checking if we need to re-load the raw data or not # remember, only the co-occurrence matrix is needed for training if os.path.exists(cc_matrix): with open(w2i_file) as f: word2idx = json.load(f) sentences = [] # dummy - we won't actually use it else: sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000) with open(w2i_file, 'w') as f: json.dump(word2idx, f) V = len(word2idx) model = Glove(80, V, 10) # model.fit(sentences, cc_matrix=cc_matrix, epochs=20) # coordinate descent model.fit( sentences, cc_matrix=cc_matrix, learning_rate=3*10e-5, reg=0.01, epochs=2000, gd=True, use_theano=False ) # gradient descent model.save(we_file)
def main(we_file, w2i_file, n_files=50): cc_matrix = "cc_matrix_%s.npy" % n_files # hacky way of checking if we need to re-load the raw data or not # remember, only the co-occurrence matrix is needed for training if os.path.exists(cc_matrix): with open(w2i_file) as f: word2idx = json.load(f) sentences = [] # dummy - we won't actually use it else: sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000) with open(w2i_file, 'w') as f: json.dump(word2idx, f) V = len(word2idx) model = Glove(80, V, 10) # model.fit(sentences, cc_matrix=cc_matrix, epochs=20) # coordinate descent model.fit(sentences, cc_matrix=cc_matrix, learning_rate=3 * 10e-5, reg=0.01, epochs=2000, gd=True, use_theano=False) # gradient descent model.save(we_file)
def main(we_file, w2i_file, use_brown=True, n_files=50):
    if use_brown:
        cc_matrix = "cc_matrix_brown.npy"
    else:
        cc_matrix = "cc_matrix_%s.npy" % n_files

    # hacky way of checking if we need to re-load the raw data or not
    # remember, only the co-occurrence matrix is needed for training
    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - we won't actually use it
    else:
        if use_brown:
            keep_words = set([
                'king', 'man', 'woman',
                'france', 'paris', 'london', 'rome', 'italy', 'britain', 'england',
                'french', 'english', 'japan', 'japanese', 'chinese', 'italian',
                'australia', 'australian', 'december', 'november', 'june',
                'january', 'february', 'march', 'april', 'may', 'july', 'august',
                'september', 'october',
            ])
            sentences, word2idx = get_sentences_with_word2idx_limit_vocab(n_vocab=5000, keep_words=keep_words)
        else:
            sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        with open(w2i_file, 'w') as f:
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)

    # model.fit(sentences, cc_matrix=cc_matrix, epochs=20)  # ALS
    model.fit(
        sentences,
        cc_matrix=cc_matrix,
        learning_rate=3*10e-5,
        reg=0.01,
        epochs=500,
        gd=True,
        use_theano=False,
        use_tensorflow=True,
    )
    model.save(we_file)
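# Rough sketch (an assumption, not the project's actual helper) of why
# keep_words matters when the Brown-corpus vocabulary is limited to 5000
# words above: the analogy-test words are exempted from frequency-based
# pruning so they keep their own indices instead of collapsing into an
# UNKNOWN token. The real get_sentences_with_word2idx_limit_vocab lives in
# the project's util module and may differ in detail.
from collections import Counter

def limit_vocab_sketch(tokenized_sentences, n_vocab=5000, keep_words=frozenset()):
    counts = Counter(w for s in tokenized_sentences for w in s)
    # most frequent words first, but every keep_word survives regardless of rank
    kept = set(w for w, _ in counts.most_common(n_vocab)) | (keep_words & set(counts))
    word2idx = {w: i for i, w in enumerate(sorted(kept))}
    word2idx['UNKNOWN'] = len(word2idx)
    unk = word2idx['UNKNOWN']
    sentences = [[word2idx.get(w, unk) for w in s] for s in tokenized_sentences]
    return sentences, word2idx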
def main(we_file, w2i_file, n_files=50):
    cc_matrix = 'cc_matrix_%s.npy' % n_files

    if os.path.exists(cc_matrix):
        with open(w2i_file) as f:
            word2idx = json.load(f)
        sentences = []  # dummy - the saved co-occurrence matrix is all we need
    else:
        sentences, word2idx = get_wikipedia_data(n_files=n_files, n_vocab=2000)
        with open(w2i_file, 'w') as f:  # open for writing before dumping word2idx
            json.dump(word2idx, f)

    V = len(word2idx)
    model = Glove(80, V, 10)
    model.fit(
        sentences=sentences,
        cc_matrix=cc_matrix,
        learning_rate=3 * 10e-5,
        reg=0.01,
        epochs=2000,
        gd=True,
        use_theano=False,
    )
    model.save(we_file)
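# Hypothetical driver (not in the original listing): the output filenames
# below are placeholders for whatever embedding / word2idx paths the rest
# of the project expects.
if __name__ == '__main__':
    main('glove_model_50.npz', 'glove_word2idx_50.json', n_files=50)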