def __setattr__(self, name: str, val: Any) -> None:
    """
    Set a global (file-level) attribute, persisting it when a file is attached.

    Args:
        name: Attribute name. A leading "!" bypasses attribute storage and sets
            a plain Python attribute (with the "!" stripped) on the manager itself.
        val: Attribute value; normalized before storage.

    Raises:
        KeyError: If the name contains a slash ("/"), which is the HDF5 path
            separator and would corrupt the file layout.
    """
    if name.startswith("!"):
        super(GlobalAttributeManager, self).__setattr__(name[1:], val)
    elif "/" in name:
        raise KeyError("Attribute name cannot contain slash (/)")
    else:
        if self.f is not None:
            if loompy.compare_loom_spec_version(self.f, "3.0.0") < 0 and "attrs" not in self.f["/"]:
                # Legacy (pre-3.0.0 spec) files keep global attributes as HDF5
                # attributes on the root group
                normalized = loompy.normalize_attr_values(val, False)
                self.f.attrs[name] = normalized
                self.f.flush()
                val = self.f.attrs[name]  # Read it back in to ensure it's synced and normalized
                normalized = loompy.materialize_attr_values(val)
                self.__dict__["storage"][name] = normalized
            else:
                # Modern files keep global attributes as datasets under "attrs"
                normalized = loompy.normalize_attr_values(val, True)
                if name in self.f["attrs"]:
                    del self.f["attrs"][name]
                if not np.isscalar(normalized) and normalized.dtype == np.object_:
                    # BUG FIX: was `self.ds._file`, but this manager holds the
                    # open file as `self.f` (used everywhere else in this
                    # method). Object arrays need an explicit variable-length
                    # string dtype for h5py.
                    self.f.create_dataset("attrs/" + name, data=normalized, dtype=h5py.special_dtype(vlen=str))
                else:
                    self.f["attrs"][name] = normalized
                self.f.flush()
                val = self.f["attrs"][name][()]  # Read it back in to ensure it's synced and normalized
                normalized = loompy.materialize_attr_values(val)
                self.__dict__["storage"][name] = normalized
def __setattr__(self, name: str, val: np.ndarray) -> None:
    """
    Set the value of a named attribute

    Args:
        name (str)          Name of the attribute
        val (np.ndarray)    Value of the attribute

    Remarks:
        Length must match the corresponding matrix dimension
        The values are automatically HTML escaped and converted to ASCII for storage
    """
    if name.startswith("!"):
        # "!name" bypasses attribute storage and sets a plain Python attribute
        super(AttributeManager, self).__setattr__(name[1:], val)
    elif "/" in name:
        # BUG FIX: guard added for consistency with the other attribute
        # managers — "/" is the HDF5 path separator and would silently create
        # nested groups instead of an attribute
        raise KeyError("Attribute name cannot contain slash (/)")
    else:
        if self.ds is not None:
            values = loompy.normalize_attr_values(val)
            # axis 0 -> row attributes, axis 1 -> column attributes
            a = ["/row_attrs/", "/col_attrs/"][self.axis]
            if self.ds.shape[self.axis] != 0 and values.shape[0] != self.ds.shape[self.axis]:
                raise ValueError(f"Attribute must have exactly {self.ds.shape[self.axis]} values but {len(values)} were given")
            # Replace any existing dataset of the same name
            if self.ds._file[a].__contains__(name):
                del self.ds._file[a + name]
            self.ds._file[a + name] = values  # TODO: for 2D arrays, use block compression along columns/rows
            self.ds._file.flush()
            # Read back so the cache holds exactly what was persisted
            self.__dict__["storage"][name] = loompy.materialize_attr_values(self.ds._file[a][name][:])
        else:
            # No dataset attached yet: just cache the raw value
            self.__dict__["storage"][name] = val
def __setattr__(self, name: str, val: np.ndarray) -> None:
    """
    Set the value of a named attribute

    Args:
        name (str)          Name of the attribute
        val (np.ndarray)    Value of the attribute

    Remarks:
        Length must match the corresponding matrix dimension
        The values are automatically HTML escaped and converted to ASCII for storage

    Raises:
        KeyError: If the name contains a slash ("/")
        ValueError: If the value length does not match the matrix dimension
    """
    if name.startswith("!"):
        # "!name" bypasses attribute storage and sets a plain Python attribute
        super(AttributeManager, self).__setattr__(name[1:], val)
    elif "/" in name:
        # "/" is the HDF5 path separator and would corrupt the group layout
        raise KeyError("Attribute name cannot contain slash (/)")
    else:
        if self.ds is not None:
            # Second argument toggles normalization behavior based on whether
            # the file is loom spec >= 3.0.0
            values = loompy.normalize_attr_values(
                val,
                compare_loom_spec_version(self.ds._file, "3.0.0") >= 0)
            # axis 0 -> row attributes, axis 1 -> column attributes
            a = ["/row_attrs/", "/col_attrs/"][self.axis]
            if self.ds.shape[self.axis] != 0 and values.shape[
                    0] != self.ds.shape[self.axis]:
                raise ValueError(
                    f"Attribute '{name}' must have exactly {self.ds.shape[self.axis]} values but {len(values)} were given"
                )
            # Replace any existing dataset of the same name
            if self.ds._file[a].__contains__(name):
                del self.ds._file[a + name]
            if isinstance(self.ds._file, h5py.File):
                # HDF5 backend: gzip-compressed dataset; object arrays need an
                # explicit variable-length string dtype, and maxshape leaves the
                # second dimension of 2D attributes resizable
                self.ds._file.create_dataset(
                    a + name,
                    data=values,
                    dtype=h5py.special_dtype(vlen=str)
                    if values.dtype == np.object_ else values.dtype,
                    maxshape=(values.shape[0], )
                    if len(values.shape) == 1 else (values.shape[0], None),
                    fletcher32=False,
                    compression="gzip",
                    shuffle=False,
                    compression_opts=2)
            else:
                # Non-h5py backend (alternative file store — TODO confirm which):
                # object arrays are stored as byte strings instead
                self.ds._file.create_dataset(
                    a + name,
                    data=values.astype(np.string_)
                    if values.dtype == np.object_ else values)
            # Touch modification timestamps at dataset, group and file level
            self.ds._file[a + name].attrs["last_modified"] = timestamp()
            self.ds._file[a].attrs["last_modified"] = timestamp()
            self.ds._file.attrs["last_modified"] = timestamp()
            if isinstance(self.ds._file, h5py.File):
                self.ds._file.flush()
            # Read back so the cache holds exactly what was persisted
            self.__dict__["storage"][
                name] = loompy.materialize_attr_values(
                    self.ds._file[a][name][:])
        else:
            # No dataset attached yet: just cache the raw value
            self.__dict__["storage"][name] = val
def __setattr__(self, name: str, val: Any) -> None:
    """
    Set a file-level attribute, persisting it as an HDF5 attribute when a
    file is attached.

    Args:
        name: Attribute name. A leading "!" bypasses attribute storage and sets
            a plain Python attribute (with the "!" stripped) on the manager itself.
        val: Attribute value; normalized before storage.

    Raises:
        KeyError: If the name contains a slash ("/"), which is the HDF5 path
            separator.
    """
    if name.startswith("!"):
        super(FileAttributeManager, self).__setattr__(name[1:], val)
    elif "/" in name:
        # BUG FIX: guard added for consistency with the other attribute
        # managers — a slash would be interpreted as an HDF5 path component
        raise KeyError("Attribute name cannot contain slash (/)")
    else:
        if self.f is not None:
            normalized = loompy.normalize_attr_values(val)
            self.f.attrs[name] = normalized
            self.f.flush()
            val = self.f.attrs[name]  # Read it back in to ensure it's synced and normalized
            normalized = loompy.materialize_attr_values(val)
            self.__dict__["storage"][name] = normalized
def _to_loom(self):
    """Write a loom file from Redshift query manifests.

    Builds the gene-by-cell expression matrix in chunks of cells, then writes
    the cell (column) and gene (row) metadata groups required by the loom spec.

    Returns:
        loom_path: Path to the new loom file.
    """
    # Put loom on the output filename if it's not already there.
    if not self.local_output_filename.endswith(".loom"):
        self.local_output_filename += ".loom"

    # Read the row (gene) attributes and then set some conventional names
    gene_df = self.query_results[QueryType.FEATURE].load_results()
    gene_df["featurekey"] = gene_df.index
    gene_count = gene_df.shape[0]
    cell_count = self.query_results[
        QueryType.CELL].manifest["record_count"]

    os.makedirs(self.working_dir, exist_ok=True)
    loom_path = os.path.join(self.working_dir, self.local_output_filename)

    # BUG FIX: the file handle was previously never closed, leaking the handle
    # and risking unflushed data when the returned path is read by a consumer.
    # All writes now happen inside try/finally so the file is always closed.
    loom_file = h5py.File(loom_path, mode="w")
    try:
        # Set some file attributes defined in the loom spec
        loom_file.attrs["CreationDate"] = self._loom_timestamp()
        loom_file.attrs["LOOM_SPEC_VERSION"] = "2.0.1"

        # Create the hdf5 dataset that will hold all the expression data.
        # Chunked one cell (column) at a time to match the write pattern below.
        matrix_dataset = loom_file.create_dataset(
            "matrix",
            shape=(gene_count, cell_count),
            dtype="float32",
            compression="gzip",
            compression_opts=2,
            chunks=(gene_count, 1))

        cellkeys = []
        cell_counter = 0
        # Iterate through the cells. For each set of cells reshape the
        # dataframe so genes are rows and cells are columns. Stick that data
        # into the expression dataset.
        for cells_df in self._generate_expression_dfs(50):
            pivoted = cells_df.pivot(
                index="featurekey", columns="cellkey",
                values="exprvalue").reindex(index=gene_df.index).fillna(0.0)
            cellkeys.extend(pivoted.columns.to_list())
            matrix_dataset[:, cell_counter:cell_counter +
                           pivoted.shape[1]] = pivoted
            cell_counter += pivoted.shape[1]
        matrix_dataset.attrs["last_modified"] = self._loom_timestamp()

        # Now write the metadata into different datasets according to the loom
        # spec. Cell metadata is reindexed to match the column order of the
        # matrix written above.
        cell_df = self.query_results[QueryType.CELL].load_results().reindex(
            index=cellkeys)

        col_attrs_group = loom_file.create_group("col_attrs")
        cell_id_dset = col_attrs_group.create_dataset(
            "CellID",
            data=loompy.normalize_attr_values(cell_df.index.to_numpy()),
            compression='gzip',
            compression_opts=2,
            chunks=(min(256, cell_count), ))
        cell_id_dset.attrs["last_modified"] = self._loom_timestamp()

        for cell_metadata_field in cell_df:
            cell_metadata = cell_df[cell_metadata_field]
            dset = col_attrs_group.create_dataset(
                cell_metadata_field,
                data=loompy.normalize_attr_values(cell_metadata.to_numpy()),
                compression='gzip',
                compression_opts=2,
                chunks=(min(256, cell_count), ))
            dset.attrs["last_modified"] = self._loom_timestamp()
        col_attrs_group.attrs["last_modified"] = self._loom_timestamp()

        row_attrs_group = loom_file.create_group("row_attrs")
        acc_dset = row_attrs_group.create_dataset(
            "Accession",
            data=loompy.normalize_attr_values(gene_df.index.to_numpy()),
            compression='gzip',
            compression_opts=2,
            chunks=(min(256, gene_count), ))
        acc_dset.attrs["last_modified"] = self._loom_timestamp()
        name_dset = row_attrs_group.create_dataset(
            "Gene",
            data=loompy.normalize_attr_values(
                gene_df["featurename"].to_numpy()),
            compression='gzip',
            compression_opts=2,
            chunks=(min(256, gene_count), ))
        name_dset.attrs["last_modified"] = self._loom_timestamp()

        for gene_metadata_field in gene_df:
            # "featurename" was already written as the conventional "Gene"
            # attribute above
            if gene_metadata_field == "featurename":
                continue
            gene_metadata = gene_df[gene_metadata_field]
            dset = row_attrs_group.create_dataset(
                gene_metadata_field,
                data=loompy.normalize_attr_values(gene_metadata.to_numpy()),
                compression='gzip',
                compression_opts=2,
                chunks=(min(256, gene_count), ))
            dset.attrs["last_modified"] = self._loom_timestamp()
        row_attrs_group.attrs["last_modified"] = self._loom_timestamp()

        # These groups are defined in the spec, but matrix service outputs
        # don't use them.
        loom_file.create_group("layers")
        loom_file.create_group("row_graphs")
        loom_file.create_group("col_graphs")

        loom_file.attrs["last_modified"] = self._loom_timestamp()
    finally:
        loom_file.close()

    return loom_path