def from_csv_yaml(
    csv_input: str,
    marker_yaml: str,
    design_csv: str = None,
    random_seed: int = 1234,
    dtype: torch.dtype = torch.float64,
) -> Any:
    """Create an Astir object from an expression CSV and marker YAML

    :param csv_input: Path to input CSV containing expression for cells (rows)
        by proteins (columns). First column is cell identifier, and additional
        column names are gene identifiers.
    :param marker_yaml: Path to input YAML file containing marker gene
        information. Should include cell_type and cell_state entries. See
        documentation.
    :param design_csv: Path to design matrix as a CSV. Rows should be cells,
        and columns covariates. First column is cell identifier, and
        additional column names are covariate identifiers.
    :param random_seed: The random seed to be used to initialize variables,
        defaults to 1234
    :param dtype: datatype of the model parameters, defaults to torch.float64
    """
    df_gex = pd.read_csv(csv_input, index_col=0)

    design = None
    if design_csv is not None:
        design = pd.read_csv(design_csv, index_col=0)

    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)

    from astir.astir import Astir

    return Astir(df_gex, marker_dict, design, random_seed, dtype=dtype)
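# Example usage (a minimal sketch; the file paths are hypothetical, and the
# fit_state()/get_cellstates() calls mirror those exercised in the test suite,
# assuming the remaining fit_state arguments keep their defaults):
#
#     ast = from_csv_yaml("expression.csv", "markers.yml", design_csv="design.csv")
#     ast.fit_state(max_epochs=100)
#     states = ast.get_cellstates()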
def from_anndata_yaml(
    anndata_file: str,
    marker_yaml: str,
    protein_name: str = None,
    cell_name: str = None,
    batch_name: str = "batch",
    random_seed: int = 1234,
    dtype: torch.dtype = torch.float64,
) -> Any:
    """Create an Astir object from an :class:`anndata.AnnData` file and a marker YAML

    :param anndata_file: Path to an :class:`anndata.AnnData` `h5ad` file
    :param marker_yaml: Path to input YAML file containing marker gene
        information. Should include cell_type and cell_state entries. See
        documentation.
    :param protein_name: The column of `adata.var` containing protein names.
        If this is None, defaults to `adata.var_names`
    :param cell_name: The column of `adata.obs` containing cell names. If this
        is None, defaults to `adata.obs_names`
    :param batch_name: The column of `adata.obs` containing batch names. A
        design matrix will be built using this (if present) using a one-hot
        encoding to control for batch, defaults to 'batch'
    :param random_seed: The random seed to be used to initialize variables,
        defaults to 1234
    :param dtype: datatype of the model parameters, defaults to torch.float64
    :return: An :class:`astir.astir.Astir` object constructed from the data in
        the AnnData file
    """
    batch_list = None

    ad = anndata.read_h5ad(anndata_file)

    df_gex = pd.DataFrame(ad.X)
    if protein_name is not None:
        df_gex.columns = ad.var[protein_name]
    else:
        df_gex.columns = ad.var_names
    if cell_name is not None:
        df_gex.index = ad.obs[cell_name]
    else:
        df_gex.index = ad.obs_names
    if batch_name is not None and batch_name in ad.obs.columns:
        batch_list = ad.obs[batch_name]

    design = None
    if batch_list is not None:
        design = (
            OneHotEncoder()
            .fit_transform(batch_list.to_numpy().reshape(-1, 1))
            .todense()
        )
        design = design[:, :-1]  # drop final column to avoid collinearity
        design = np.concatenate(
            [np.ones((design.shape[0], 1)), design], axis=1
        )  # prepend intercept column

    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)

    from astir.astir import Astir

    return Astir(df_gex, marker_dict, design, random_seed, dtype)
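# Illustration of the batch design matrix built above (an example, not library
# code): for batch labels ["a", "a", "b", "c"], the one-hot encoding over the
# sorted categories is [[1,0,0],[1,0,0],[0,1,0],[0,0,1]]; dropping the final
# column and prepending an intercept gives
#
#     [[1., 1., 0.],
#      [1., 1., 0.],
#      [1., 0., 1.],
#      [1., 0., 0.]]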
def from_loompy_yaml(
    loom_file: str,
    marker_yaml: str,
    protein_name_attr: str = "protein",
    cell_name_attr: str = "cell_name",
    batch_name_attr: str = "batch",
    create_design_mat: bool = True,
    random_seed: int = 1234,
    dtype: torch.dtype = torch.float64,
) -> Any:
    """Create an Astir object from a loom file and a marker YAML

    :param loom_file: Path to a loom file, where rows correspond to proteins
        and columns to cells
    :param marker_yaml: Path to input YAML file containing marker gene
        information. Should include cell_type and cell_state entries. See
        documentation.
    :param protein_name_attr: The attribute (key) in the row attributes that
        identifies the protein names (required to match with the marker gene
        information), defaults to protein
    :param cell_name_attr: The attribute (key) in the column attributes that
        identifies the name of each cell, defaults to cell_name
    :param batch_name_attr: The attribute (key) in the column attributes that
        identifies the batch. A design matrix will be built using this (if
        present) using a one-hot encoding to control for batch, defaults to
        batch
    :param create_design_mat: Determines whether a design matrix is created,
        defaults to True
    :param random_seed: The random seed to be used to initialize variables,
        defaults to 1234
    :param dtype: datatype of the model parameters, defaults to torch.float64
    :return: An :class:`astir.astir.Astir` object constructed from the data in
        the loom file
    """
    # TODO: This function is memory inefficient and goes against the
    # philosophy of loom files. Should be improved.
    batch_list = None

    with loompy.connect(loom_file) as ds:
        df_gex = pd.DataFrame(ds[:, :].T)
        df_gex.columns = ds.ra[protein_name_attr]
        if cell_name_attr in ds.ca.keys():
            df_gex.index = ds.ca[cell_name_attr]
        if batch_name_attr in ds.ca.keys():
            batch_list = ds.ca[batch_name_attr]

    design = None
    if batch_list is not None and create_design_mat:
        design = OneHotEncoder().fit_transform(batch_list.reshape(-1, 1)).todense()
        design = design[:, :-1]  # drop final column to avoid collinearity
        design = np.concatenate(
            [np.ones((design.shape[0], 1)), design], axis=1
        )  # prepend intercept column

    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)

    from astir.astir import Astir

    return Astir(df_gex, marker_dict, design, random_seed, dtype)
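# Example usage (a minimal sketch; "cells.loom" and "markers.yml" are
# hypothetical paths, and the attribute names simply restate the defaults):
#
#     ast = from_loompy_yaml(
#         "cells.loom",
#         "markers.yml",
#         protein_name_attr="protein",
#         cell_name_attr="cell_name",
#         batch_name_attr="batch",
#     )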
def from_csv_dir_yaml(
    input_dir: str,
    marker_yaml: str,
    create_design_mat: bool = True,
    random_seed: int = 1234,
    dtype: torch.dtype = torch.float64,
) -> Any:
    """Create an Astir object from a directory containing multiple CSV files

    The CSV files are read and concatenated row-wise (cells). If
    `create_design_mat` is True, a sample-specific design matrix is built with
    one one-hot-encoded column per input file plus an intercept.

    :param input_dir: Path to a directory containing multiple CSV files, each
        in the format expected by `from_csv_yaml`
    :param marker_yaml: Path to input YAML file containing marker gene
        information. Should include cell_type and cell_state entries. See
        documentation.
    :param create_design_mat: Determines whether a design matrix is created,
        defaults to True
    :param random_seed: The random seed to be used to initialize variables,
        defaults to 1234
    :param dtype: datatype of the model parameters, defaults to torch.float64
    """
    # Parse the input directory
    csv_files = [
        os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith("csv")
    ]

    # Read each CSV into a gene expression data frame and concatenate
    dfs = [pd.read_csv(f, index_col=0) for f in csv_files]
    df_gex = pd.concat(dfs, axis=0)

    design = None
    if create_design_mat:
        # Construct a sample-specific design matrix
        design_list = [np.repeat(str(i), dfs[i].shape[0]) for i in range(len(dfs))]
        design = (
            OneHotEncoder()
            .fit_transform(np.concatenate(design_list, axis=0).reshape(-1, 1))
            .todense()
        )
        design = design[:, :-1]  # drop final column to avoid collinearity
        design = np.concatenate(
            [np.ones((design.shape[0], 1)), design], axis=1
        )  # prepend intercept column

    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)

    from astir.astir import Astir

    return Astir(df_gex, marker_dict, design, random_seed, dtype)
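# Example usage (a minimal sketch; "csv_dir/" and "markers.yml" are
# hypothetical paths; every *.csv file in the directory is read and
# concatenated row-wise, with one design-matrix column per file):
#
#     ast = from_csv_dir_yaml("csv_dir/", "markers.yml", create_design_mat=True)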
def test_command_all_flags(self):
    warnings.filterwarnings("ignore", category=UserWarning)
    (
        design,
        max_epochs,
        lr,
        batch_size,
        random_seed,
        n_init,
        n_init_epochs,
        dtype,
        delta_loss,
        delta_loss_batch,
    ) = (None, 2, 1e-1, 128, 1234, 1, 1, torch.float64, 1e-3, 10)

    bash_command = "python -W ignore {} {} {} {} {}".format(
        self.exec_path,
        "state",
        self.expr_csv_file,
        self.marker_yaml_file,
        self.output_file,
    )
    bash_command += " --design {}".format(design)
    bash_command += " --max_epochs {}".format(max_epochs)
    bash_command += " --learning_rate {}".format(lr)
    bash_command += " --batch_size {}".format(batch_size)
    bash_command += " --random_seed {}".format(random_seed)
    bash_command += " --n_init {}".format(n_init)
    bash_command += " --n_init_epochs {}".format(n_init_epochs)
    bash_command += " --dtype {}".format(dtype)
    bash_command += " --delta_loss {}".format(delta_loss)
    bash_command += " --delta_loss_batch {}".format(delta_loss_batch)

    process = subprocess.Popen(bash_command.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    self.assertIsNone(error)

    # Create an Astir object directly to compare against the CLI output
    ast = Astir(
        input_expr=self.expr,
        marker_dict=self.marker_dict,
        design=design,
        random_seed=random_seed,
        dtype=dtype,
    )
    ast.fit_state(
        max_epochs=max_epochs,
        learning_rate=lr,
        batch_size=batch_size,
        delta_loss=delta_loss,
        n_init=n_init,
        n_init_epochs=n_init_epochs,
        delta_loss_batch=delta_loss_batch,
    )
    expected_assign = ast.get_cellstates()
    actual_assign = pd.read_csv(self.output_file, index_col=0)

    self.assertEqual(len(expected_assign), len(actual_assign))
    self.assertTrue((expected_assign.columns == actual_assign.columns).all())
    self.assertTrue(
        (abs(actual_assign.to_numpy() - expected_assign.to_numpy()) < 0.01).all()
    )