def apply(self, matrix_, column_marginal=None):
    """
    Performs epmi weighting.

    Args:
        matrix_ (Matrix): Input matrix
        column_marginal (np.ndarray): column marginals of the core
            matrix if the matrix is a peripheral matrix

    Returns:
        Matrix: the matrix after applying epmi.
    """
    matrix_.assert_positive()
    row_sum = matrix_.sum(axis=1)

    # A peripheral matrix reuses the core matrix's column marginals so
    # both spaces are weighted against the same statistics.
    if column_marginal is not None:
        col_sum = column_marginal
    else:
        col_sum = matrix_.sum(axis=0)

    total = col_sum.sum()

    # epmi(w, c) = N * count(w, c) / (count(w) * count(c)).
    # nonzero_invert inverts only nonzero entries, so empty rows or
    # columns do not cause a division by zero.
    row_sum = nonzero_invert(row_sum)
    col_sum = nonzero_invert(col_sum)
    col_sum = col_sum * total

    matrix_ = matrix_.scale_rows(row_sum)
    matrix_ = matrix_.scale_columns(col_sum)
    return matrix_
def apply(self, matrix_, column_marginal=None):
    """
    Performs epmi weighting.

    Args:
        matrix_ (Matrix): Input matrix
        column_marginal (np.ndarray): column marginals of the core
            matrix if the matrix is a peripheral matrix

    Returns:
        Matrix: the matrix after applying epmi.
    """
    matrix_.assert_positive()
    row_sum = matrix_.sum(axis=1)

    # For peripheral matrices the caller supplies the core matrix's
    # column marginals instead of recomputing them here.
    if column_marginal is not None:
        col_sum = column_marginal
    else:
        col_sum = matrix_.sum(axis=0)

    total = col_sum.sum()

    # Scale each cell by N / (row_marginal * col_marginal); zero
    # marginals stay zero via nonzero_invert, avoiding division by zero.
    row_sum = nonzero_invert(row_sum)
    col_sum = nonzero_invert(col_sum)
    col_sum = col_sum * total

    matrix_ = matrix_.scale_rows(row_sum)
    matrix_ = matrix_.scale_columns(col_sum)
    return matrix_
def _sims_to_matrix(self, vector, matrix_):
    """
    Rescale the dot-product similarities of *vector* against the rows
    of *matrix_* by the inverted product of the corresponding norms
    (i.e. cosine-style normalization; zero norms are not inverted).
    """
    dot_products = DotProdSimilarity()._sims_to_matrix(vector, matrix_)
    norm_products = vector.norm() * matrix_.norm(1)
    inverse_norms = nonzero_invert(norm_products)
    return dot_products.scale_rows(inverse_norms)
def apply(self, matrix_):
    """
    Normalize each row of *matrix_* by a per-row denominator: the row's
    L2 norm when ``self.criterion`` is ``"length"``, the row's sum
    otherwise. Denominators are inverted with ``nonzero_invert`` so
    zero-valued rows do not trigger a division by zero.
    """
    if self.criterion == "length":
        denominators = matrix_.norm(axis=1)
    else:
        denominators = matrix_.sum(axis=1)
    scaling = nonzero_invert(denominators)
    return matrix_.scale_rows(scaling)
def main():
    """
    Compute the smoothed and shifted (P)PMI matrix from a co-occurrence
    matrix. Smoothing is performed as described in

       Omer Levy, Yoav Goldberg, and Ido Dagan. 2015. Improving
       distributional similarity with lessons learned from word
       embeddings. Trans. ACL, 3.
    """

    # Get the arguments. NOTE(review): the usage string's line structure
    # was reconstructed (docopt is whitespace-sensitive: options and
    # their descriptions must be separated by two or more spaces).
    args = docopt('''Compute the smoothed and shifted (P)PMI matrix from a co-occurrence matrix and save it in pickle format.

    Usage:
        ppmi.py [-l] <dsm_prefix> <k> <alpha> <outPath>

        <dsm_prefix> = the prefix for the input files (.sm for the matrix, .rows and .cols) and output files (.ppmi)
        <k> = shifting parameter
        <alpha> = smoothing parameter
        <outPath> = output path for space

    Options:
        -l, --len  normalize final vectors to unit length

    ''')

    is_len = args['--len']
    dsm_prefix = args['<dsm_prefix>']
    k = int(args['<k>'])
    alpha = float(args['<alpha>'])
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Get space with sparse matrix
    dsm = load_pkl_files(dsm_prefix)
    id2row = dsm.get_id2row()
    id2column = dsm.get_id2column()

    # Get marginal counts
    matrix_ = dsm.cooccurrence_matrix
    matrix_.assert_positive()
    row_sum = matrix_.sum(axis=1)
    col_sum = matrix_.sum(axis=0)

    # Compute smoothed context distribution P_alpha(c)
    smooth_col_sum = np.power(col_sum, alpha)
    col_sum = smooth_col_sum / smooth_col_sum.sum()

    # Invert marginals (zero marginals stay zero, no division by zero)
    row_sum = nonzero_invert(row_sum)
    col_sum = nonzero_invert(col_sum)

    # Apply epmi weighting (without log)
    matrix_ = matrix_.scale_rows(row_sum)
    matrix_ = matrix_.scale_columns(col_sum)

    # Apply log weighting (only stored nonzero entries are touched)
    matrix_.mat.data = np.log(matrix_.mat.data)

    # Shift values by log(k)
    matrix_.mat.data -= np.log(k)

    # Eliminate negative counts (shifted PPMI clips at zero)
    matrix_.mat.data[matrix_.mat.data <= 0] = 0.0

    # Eliminate zero counts from the sparse structure
    matrix_.mat.eliminate_zeros()

    matrix_ = matrix_.get_mat()

    if is_len:
        # L2-normalize vectors
        l2norm1 = linalg.norm(matrix_, axis=1, ord=2)
        l2norm1[l2norm1 == 0.0] = 1.0  # Convert 0 values to 1
        matrix_ /= l2norm1.reshape(len(l2norm1), 1)

    dsm = Space(SparseMatrix(matrix_), id2row, id2column)

    # Save the Space object in pickle format
    save_pkl_files(dsm, outPath + ".ppmi.sm", save_in_one_file=False)

    logging.info("--- %s seconds ---" % (time.time() - start_time))