def op_union_indices(
    op: Callable,
    a: sparse.csr_matrix,
    b: sparse.csr_matrix,
    *,
    default_value=0,
):
    """Combine two equally shaped sparse matrices with a binary ``op``.

    The index/data arrays of both operands are handed to
    ``op_union_indices_csr_csr`` and its result is wrapped in a matrix of
    the same type and shape as ``a``.

    Args:
        op: Binary callable; it is called once here on a pair of scalars to
            determine the output dtype, and is forwarded to the kernel.
        a: Left operand. ``sort_indices()`` is called on it, so it is
            modified in place.
        b: Right operand. Converted to ``type(a)`` when its type differs,
            then ``sort_indices()`` is called on it as well.
        default_value: Value forwarded to the kernel after being cast to the
            output dtype (presumably used where only one operand has a
            stored entry -- confirm against ``op_union_indices_csr_csr``).

    Returns:
        A matrix of ``type(a)`` with shape ``a.shape``.

    Raises:
        AssertionError: If ``a`` and ``b`` do not have the same shape.
    """
    assert a.shape == b.shape, f"shape mismatch: {a.shape} != {b.shape}"
    if type(b) is not type(a):
        b = type(a)(b)
    a.sort_indices()
    b.sort_indices()

    # The output dtype is inferred by applying ``op`` to one scalar from each
    # operand. A matrix without stored entries has an empty ``data`` array, so
    # indexing ``data[0]`` would raise IndexError; fall back to a scalar of
    # the operand's dtype instead. The fallback is 1 rather than 0 so that
    # probing ops such as division does not divide by zero.
    probe_a = a.data[0] if a.data.size else a.dtype.type(1)
    probe_b = b.data[0] if b.data.size else b.dtype.type(1)
    # TODO: numpy is weird with bools here
    out_dtype = np.array(op(probe_a, probe_b)).dtype
    default_value = out_dtype.type(default_value)

    return type(a)(
        op_union_indices_csr_csr(
            op,
            a.indptr,
            a.indices,
            a.data,
            b.indptr,
            b.indices,
            b.data,
            out_dtype=out_dtype,
            default_value=default_value,
        ),
        a.shape,
    )
def csr_to_rmat(csr: sparse.csr_matrix):
    """Convert a SciPy CSR matrix into an R ``<t>gRMatrix`` object.

    ``get_type_conv`` supplies, for the matrix dtype, the class-name prefix
    ``t`` and a converter for the data array. The object is created through
    ``methods.new`` with the slots ``j`` (column indices), ``p`` (row
    pointer), ``x`` (values) and ``Dim`` (shape).

    Note:
        ``csr.sort_indices()`` is called first, so the input matrix is
        modified in place.
    """
    csr.sort_indices()
    # Only the first two items of the conversion triple are needed here.
    t, to_r_values, _unused = get_type_conv(csr.dtype)

    make_r_object = methods.new
    return make_r_object(
        f"{t}gRMatrix",
        **{
            "j": as_integer(csr.indices),
            "p": as_integer(csr.indptr),
            "x": to_r_values(csr.data),
            "Dim": as_integer(list(csr.shape)),
        },
    )
def _X_to_df(self, X: sps.csr_matrix, user_ids: List[Any]) -> pd.DataFrame:
    """Flatten a sparse user-item matrix into a long-format DataFrame.

    Args:
        X: Sparse matrix whose row ``r`` corresponds to ``user_ids[r]`` and
            whose column ``c`` corresponds to ``self.item_ids[c]``.
            ``sort_indices()`` is called on it, so it is modified in place.
        user_ids: Identifier for each row of ``X``.

    Returns:
        A DataFrame with columns ``user_id``, ``item_id`` and ``rating``,
        one row per non-zero entry of ``X``, in row-major order.

    Raises:
        RuntimeError: If ``self.item_ids`` has not been set.
    """
    if self.item_ids is None:
        raise RuntimeError(
            "Setting item_ids is required to use this method.")

    X.sort_indices()

    # Take rows, columns and values from the same COO view and apply one
    # mask to all three. ``X.nonzero()`` skips explicitly stored zeros while
    # ``X.data`` keeps them, so mixing the two yields columns of different
    # lengths whenever the matrix holds an explicit zero.
    coo = X.tocoo()
    keep = coo.data != 0
    rows = coo.row[keep]
    cols = coo.col[keep]
    item_ids = self.item_ids

    return pd.DataFrame(
        dict(
            user_id=[user_ids[r] for r in rows],
            item_id=[item_ids[c] for c in cols],
            rating=coo.data[keep],
        ))
def layout(self,
           transformed: np.ndarray,
           initial_pos: np.ndarray = None,
           knn: sparse.csr_matrix = None) -> np.ndarray:
    """
    Compute Barnes-Hut approximate t-SNE layout

    Args:
        transformed:    The (typically) PCA-transformed input data, shape: (n_samples, n_components)
        initial_pos:    Initial layout, or None to use the first 'self.n_dims' components of 'transformed'
        knn:            Precomputed knn graph in sparse matrix format, or None to use Gaussian perplexity.
                        'sort_indices()' is called on it, so it is modified in place.

    Returns:
        The coordinates read back from 'result.dat', one row per sample, sorted by the
        index that bh_tsne reports for each row.

    Raises:
        RuntimeError:   If 'bh_tsne' cannot be started or exits with a non-zero return code.

    Remarks:
        Requires 'bh_tsne' to be available on the $PATH
    """
    n_cells = transformed.shape[0]
    n_components = transformed.shape[1]
    nnz = 0
    if initial_pos is None:
        initial_pos = transformed[:, :self.n_dims]
    if knn is not None:
        # NOTE: the graph is written exactly as given; it is neither
        # symmetrized nor row-normalized here.
        knn.sort_indices()
        nnz = knn.nnz

    failure_message = "TSNE layout failed to execute external binary 'bh_tsne' (check $PATH)"

    with tempfile.TemporaryDirectory() as td:
        with open(os.path.join(td, 'data.dat'), 'wb') as data_file:
            # Write the bh_tsne header
            data_file.write(
                pack('=iiiddii', n_cells, n_components, nnz, self.theta,
                     self.perplexity, self.n_dims, self.max_iter))
            # Write the initial positions
            for ix in range(n_cells):
                pos = initial_pos[ix, :]
                data_file.write(pack('={}d'.format(pos.shape[0]), *pos))
            # A non-empty knn graph is written as its three CSR arrays
            if nnz != 0:
                data_file.write(
                    pack('={}i'.format(knn.indptr.shape[0]), *knn.indptr))
                data_file.write(
                    pack('={}i'.format(knn.indices.shape[0]), *knn.indices))
                data_file.write(
                    pack('={}d'.format(knn.data.shape[0]), *knn.data))
            # Then write the data
            for ix in range(n_cells):
                sample = transformed[ix, :]
                data_file.write(
                    pack('={}d'.format(sample.shape[0]), *sample))

        # Call bh_tsne and let it do its thing. A binary that is missing or
        # not executable makes Popen itself raise OSError (it never reaches
        # the return-code check), so report that case the same way.
        try:
            bh_tsne_p = Popen(("bh_tsne", ), cwd=td)
        except OSError as err:
            logging.error(failure_message)
            raise RuntimeError(failure_message) from err
        bh_tsne_p.wait()
        if bh_tsne_p.returncode != 0:
            logging.error(failure_message)
            raise RuntimeError("{} [return code {}]".format(
                failure_message, bh_tsne_p.returncode))

        # Read and pass on the results
        with open(os.path.join(td, 'result.dat'), 'rb') as output_file:
            # The first two integers are just the number of samples and the
            # dimensionality
            _, n_dims = _read_unpack('ii', output_file)
            # Collect the results, but they may be out of order
            results = [
                _read_unpack('{}d'.format(n_dims), output_file)
                for _ in range(n_cells)
            ]
            # Now collect the landmark data so that we can return the data in
            # the order it arrived
            results = [(_read_unpack('i', output_file), e) for e in results]

    # Put the results in order (sorted by landmark index) and return them
    results.sort()
    xy = [result for _, result in results]
    return np.array(xy)