def fit(self, matrix, epochs=5, no_threads=2, verbose=False):
    """
    Estimate the word embeddings.

    Parameters:
    - scipy.sparse.coo_matrix matrix: co-occurrence matrix (must be square)
    - int epochs: number of training epochs
    - int no_threads: number of training threads
    - bool verbose: print progress messages if True

    Raises:
    - Exception: if the matrix is not two-dimensional and square, or is
      not in the COO sparse format.

    Side effects:
    Re-initialises ``self.word_vectors`` (uniform random in [0, 1), shape
    ``(n_words, self.no_components)``) and ``self.word_biases`` (zeros)
    before training, so any previously fitted state is discarded.
    """

    shape = matrix.shape

    if (len(shape) != 2 or shape[0] != shape[1]):
        raise Exception('Coocurrence matrix must be square')

    if not sp.isspmatrix_coo(matrix):
        raise Exception('Coocurrence matrix must be in the COO format')

    self.word_vectors = np.random.rand(shape[0], self.no_components)
    self.word_biases = np.zeros(shape[0], dtype=np.float64)

    # One index per stored (row, col, value) entry; reshuffled every epoch
    # so the entries are visited in a different order each time.
    shuffle_indices = np.arange(matrix.nnz, dtype=np.int32)

    if verbose:
        # A single pre-formatted argument keeps this line valid under both
        # the Python 2 print statement and the Python 3 print function.
        print('Performing %s training epochs '
              'with %s threads' % (epochs, no_threads))

    for epoch in range(epochs):

        if verbose:
            print('Epoch %s' % epoch)

        # Shuffle the co-occurrence matrix
        np.random.shuffle(shuffle_indices)

        # fit_vectors is defined outside this block; presumably it updates
        # word_vectors / word_biases in place -- TODO confirm.
        fit_vectors(self.word_vectors,
                    self.word_biases,
                    matrix.row,
                    matrix.col,
                    matrix.data,
                    shuffle_indices,
                    self.learning_rate,
                    self.max_count,
                    self.alpha,
                    int(no_threads))
def fit(self, matrix, epochs=5, no_threads=2, verbose=False):
    """
    Estimate the word embeddings.

    Parameters:
    - scipy.sparse.coo_matrix matrix: co-occurrence matrix (must be square)
    - int epochs: number of training epochs
    - int no_threads: number of training threads
    - bool verbose: print progress messages if True; also records the
      mean effective learning rate per epoch and plots it at the end

    Raises:
    - Exception: if the matrix is not two-dimensional and square, is not
      in the COO sparse format, or if the word vectors contain non-finite
      values after an epoch.

    Side effects:
    Re-initialises ``self.word_vectors``, ``self.word_biases``,
    ``self.vectors_sum_gradients`` and ``self.biases_sum_gradients``.
    """

    shape = matrix.shape

    if (len(shape) != 2 or shape[0] != shape[1]):
        raise Exception('Coocurrence matrix must be square')

    if not sp.isspmatrix_coo(matrix):
        raise Exception('Coocurrence matrix must be in the COO format')

    # Uniform in [-0.5, 0.5), scaled down by the embedding dimension.
    self.word_vectors = ((np.random.rand(shape[0],
                                         self.no_components) - 0.5)
                         / self.no_components)
    self.word_biases = np.zeros(shape[0], dtype=np.float64)

    # Running gradient accumulators, started at one so the
    # learning_rate / sqrt(accumulator) ratio below is well defined.
    self.vectors_sum_gradients = np.ones_like(self.word_vectors)
    self.biases_sum_gradients = np.ones_like(self.word_biases)

    shuffle_indices = np.arange(matrix.nnz, dtype=np.int32)

    if verbose:
        print('Performing %s training epochs '
              'with %s threads' % (epochs, no_threads))

    # Per-epoch mean learning rates, filled only when verbose.
    vectors_gradients = []
    biases_gradients = []

    for epoch in range(epochs):

        if verbose:
            starttime = dt.datetime.now()
            print('Epoch %s' % epoch)

        # Shuffle the co-occurrence matrix
        np.random.shuffle(shuffle_indices)

        # fit_vectors is defined outside this block; presumably it updates
        # the arrays passed in place -- TODO confirm.
        fit_vectors(self.word_vectors,
                    self.vectors_sum_gradients,
                    self.word_biases,
                    self.biases_sum_gradients,
                    matrix.row,
                    matrix.col,
                    matrix.data,
                    shuffle_indices,
                    self.learning_rate,
                    self.max_count,
                    self.alpha,
                    self.max_loss,
                    int(no_threads))

        if not np.isfinite(self.word_vectors).all():
            raise Exception('Non-finite values in word vectors. '
                            'Try reducing the learning rate or the '
                            'max_loss parameter.')

        if verbose:
            # One vectorised expression over the whole accumulator array
            # instead of a Python-level loop over its rows.
            vectors_gradients.append(
                np.mean(self.learning_rate
                        / np.sqrt(self.vectors_sum_gradients)))
            biases_gradients.append(
                np.mean(self.learning_rate
                        / np.sqrt(self.biases_sum_gradients)))
            endtime = dt.datetime.now()
            print('    Epoch %s took %s minutes'
                  % (epoch, (endtime - starttime).total_seconds() / 60))

    if verbose:
        # show the learning rates
        plt.plot(vectors_gradients, 'k--', biases_gradients, 'k:')
        plt.legend(('word vectors', 'word biases'))
        plt.xlabel('Epoch')
        plt.ylabel('Mean learning rate')
        plt.title('Change in mean learning rates across epochs')
        plt.show()
def fit(self, matrix, epochs=5, no_threads=2, verbose=False):
    """
    Estimate the word embeddings.

    Parameters:
    - scipy.sparse.coo_matrix matrix: co-occurrence matrix (must be square)
    - int epochs: number of training epochs
    - int no_threads: number of training threads
    - bool verbose: print progress messages if True; when set, the mean
      effective learning rate of each epoch is also collected and plotted
      once training finishes

    Raises:
    - Exception: if the matrix is not two-dimensional and square, is not
      in the COO sparse format, or if non-finite values appear in the word
      vectors after an epoch.

    Side effects:
    Resets ``self.word_vectors``, ``self.word_biases``,
    ``self.vectors_sum_gradients`` and ``self.biases_sum_gradients``
    before training starts.
    """

    shape = matrix.shape

    if (len(shape) != 2 or shape[0] != shape[1]):
        raise Exception('Coocurrence matrix must be square')

    if not sp.isspmatrix_coo(matrix):
        raise Exception('Coocurrence matrix must be in the COO format')

    # Small symmetric initial values: uniform in [-0.5, 0.5) divided by the
    # number of components.
    self.word_vectors = (
        (np.random.rand(shape[0], self.no_components) - 0.5) /
        self.no_components)
    self.word_biases = np.zeros(shape[0], dtype=np.float64)

    # Gradient accumulators start at one, which keeps the
    # learning_rate / sqrt(accumulator) ratio computed below finite.
    self.vectors_sum_gradients = np.ones_like(self.word_vectors)
    self.biases_sum_gradients = np.ones_like(self.word_biases)

    shuffle_indices = np.arange(matrix.nnz, dtype=np.int32)

    if verbose:
        print('Performing %s training epochs '
              'with %s threads' % (epochs, no_threads))

    # initialize lists that will hold the learning rates
    vectors_gradients = []
    biases_gradients = []

    for epoch in range(epochs):

        if verbose:
            starttime = dt.datetime.now()
            print('Epoch %s' % epoch)

        # Shuffle the co-occurrence matrix
        np.random.shuffle(shuffle_indices)

        # fit_vectors lives outside this block; it is presumably what
        # mutates the arrays handed to it -- TODO confirm.
        fit_vectors(self.word_vectors, self.vectors_sum_gradients,
                    self.word_biases, self.biases_sum_gradients,
                    matrix.row, matrix.col, matrix.data, shuffle_indices,
                    self.learning_rate, self.max_count, self.alpha,
                    self.max_loss, int(no_threads))

        if not np.isfinite(self.word_vectors).all():
            raise Exception('Non-finite values in word vectors. '
                            'Try reducing the learning rate or the '
                            'max_loss parameter.')

        if verbose:
            # Vectorised over the full arrays; no per-row Python loop.
            mean_vector_rate = np.mean(
                self.learning_rate / np.sqrt(self.vectors_sum_gradients))
            mean_bias_rate = np.mean(
                self.learning_rate / np.sqrt(self.biases_sum_gradients))
            vectors_gradients.append(mean_vector_rate)
            biases_gradients.append(mean_bias_rate)

            endtime = dt.datetime.now()
            print('    Epoch %s took %s minutes' %
                  (epoch, (endtime - starttime).total_seconds() / 60))

    if verbose:
        # show the learning rates
        plt.plot(vectors_gradients, 'k--', biases_gradients, 'k:')
        plt.legend(('word vectors', 'word biases'))
        plt.xlabel('Epoch')
        plt.ylabel('Mean learning rate')
        plt.title('Change in mean learning rates across epochs')
        plt.show()
def fit(self, matrix, epochs=5, no_threads=2, verbose=False):
    """
    Estimate the word embeddings.

    Parameters:
    - scipy.sparse.coo_matrix matrix: co-occurrence matrix (must be square)
    - int epochs: number of training epochs
    - int no_threads: number of training threads
    - bool verbose: print progress messages (including the per-epoch
      progress bar) if True

    Raises:
    - Exception: if the matrix is not two-dimensional and square, is not
      in the COO sparse format, or if non-finite values appear in the word
      vectors after an epoch.

    Side effects:
    Resets ``self.word_vectors``, ``self.word_biases``,
    ``self.vectors_sum_gradients`` and ``self.biases_sum_gradients``.
    Random numbers are drawn from the generator built from
    ``self.random_state``.
    """

    shape = matrix.shape

    if (len(shape) != 2 or shape[0] != shape[1]):
        raise Exception('Coocurrence matrix must be square')

    if not sp.isspmatrix_coo(matrix):
        raise Exception('Coocurrence matrix must be in the COO format')

    random_state = check_random_state(self.random_state)

    # Uniform in [-0.5, 0.5), scaled down by the embedding dimension.
    self.word_vectors = ((random_state.rand(shape[0],
                                            self.no_components) - 0.5)
                         / self.no_components)
    self.word_biases = np.zeros(shape[0], dtype=np.float64)

    # Gradient accumulators deliberately start at zero in this variant
    # (not one).
    self.vectors_sum_gradients = np.zeros_like(self.word_vectors)
    self.biases_sum_gradients = np.zeros_like(self.word_biases)

    shuffle_indices = np.arange(matrix.nnz, dtype=np.int32)

    if verbose:
        print('Performing %s training epochs '
              'with %s threads on %s cooccurrence words'
              % (epochs, no_threads, matrix.nnz))

    # The bar is progress output, so it follows the verbose flag like the
    # print above; a disabled bar still iterates normally.
    progress = tqdm(range(epochs),
                    desc="Epoch 0: start training",
                    disable=not verbose)

    for epoch in progress:

        # Shuffle the co-occurrence matrix
        random_state.shuffle(shuffle_indices)

        # fit_vectors is defined outside this block; its return value is
        # used here as the epoch's average loss.
        avg_loss = fit_vectors(self.word_vectors,
                               self.vectors_sum_gradients,
                               self.word_biases,
                               self.biases_sum_gradients,
                               matrix.row,
                               matrix.col,
                               matrix.data,
                               shuffle_indices,
                               self.learning_rate,
                               self.max_count,
                               self.alpha,
                               self.max_loss,
                               int(no_threads))

        progress.set_description(
            'Epoch {}: average loss {}'.format(epoch, avg_loss))

        if not np.isfinite(self.word_vectors).all():
            raise Exception('Non-finite values in word vectors. '
                            'Try reducing the learning rate or the '
                            'max_loss parameter.')
def fit(self, matrix, epochs=5, no_threads=2, verbose=False, wordList=None,
        save_gap=None, retrain=False, save_vectors=False):
    """
    Estimate the word embeddings.

    Parameters:
    - scipy.sparse.coo_matrix matrix: co-occurrence matrix (must be square)
    - int epochs: number of training epochs
    - int no_threads: number of training threads
    - bool verbose: print progress messages if True
    - list wordList: words (keys of ``self.dictionary``) whose pairwise
      cosine similarities are written to ``output_<index>.csv`` at
      snapshot epochs; no snapshots are taken when None
    - int save_gap: take a snapshot every ``save_gap`` epochs (epochs
      where ``epoch % save_gap == 0``); None or 0 disables snapshots
    - bool retrain: if True, keep the existing vectors, biases and
      gradient accumulators instead of re-initialising them
    - bool save_vectors: if True, also pickle the snapshot vectors to
      ``vectors_<index>`` at each snapshot

    ``<index>`` is ``self.startIndex + epoch``.

    Raises:
    - Exception: if the matrix is not two-dimensional and square, is not
      in the COO sparse format, or if non-finite values appear in the word
      vectors after an epoch.
    """

    shape = matrix.shape

    if (len(shape) != 2 or shape[0] != shape[1]):
        raise Exception('Coocurrence matrix must be square')

    if not sp.isspmatrix_coo(matrix):
        raise Exception('Coocurrence matrix must be in the COO format')

    random_state = check_random_state(self.random_state)

    if not retrain:
        # Fresh start: uniform in [-0.5, 0.5) scaled by the embedding
        # dimension, zero biases, and accumulators at one.
        # NOTE(review): assumes all four arrays are kept when retraining;
        # the collapsed source does not show the extent of this branch
        # -- confirm.
        self.word_vectors = ((random_state.rand(shape[0],
                                                self.no_components) - 0.5)
                             / self.no_components)
        self.word_biases = np.zeros(shape[0], dtype=np.float64)
        self.vectors_sum_gradients = np.ones_like(self.word_vectors)
        self.biases_sum_gradients = np.ones_like(self.word_biases)

    shuffle_indices = np.arange(matrix.nnz, dtype=np.int32)

    if verbose:
        print('Performing %s training epochs '
              'with %s threads' % (epochs, no_threads))

    for epoch in range(epochs):

        if verbose:
            print('Epoch %s' % epoch)

        # Shuffle the co-occurrence matrix
        random_state.shuffle(shuffle_indices)

        # fit_vectors is defined outside this block; presumably it updates
        # the arrays passed in place -- TODO confirm.
        fit_vectors(self.word_vectors,
                    self.vectors_sum_gradients,
                    self.word_biases,
                    self.biases_sum_gradients,
                    matrix.row,
                    matrix.col,
                    matrix.data,
                    shuffle_indices,
                    self.learning_rate,
                    self.max_count,
                    self.alpha,
                    self.max_loss,
                    int(no_threads))

        if not np.isfinite(self.word_vectors).all():
            raise Exception('Non-finite values in word vectors. '
                            'Try reducing the learning rate or the '
                            'max_loss parameter.')

        # Check wordList and save_gap before the modulo: with the default
        # save_gap=None, ``epoch % save_gap`` would raise TypeError (and
        # ZeroDivisionError for 0) even when no snapshot was requested.
        if wordList is not None and save_gap and epoch % save_gap == 0:
            snapshot_index = str(self.startIndex + epoch)

            # Each vector is reshaped to (1, n_components) because
            # cosine_similarity is called with 2-D inputs and its result
            # is indexed as [0][0].
            vectorList = [
                self.word_vectors[self.dictionary[x]].reshape(1, -1)
                for x in wordList
            ]

            mat = [
                [cosine_similarity(left, right)[0][0]
                 for right in vectorList]
                for left in vectorList
            ]

            df = pd.DataFrame(mat, columns=wordList, index=wordList)
            df.to_csv("output_" + snapshot_index + ".csv")

            if save_vectors:
                with open("vectors_" + snapshot_index, 'wb') as f:
                    pickle.dump(vectorList, f)