示例#1
0
def from_csv_yaml(
    csv_input: str,
    marker_yaml: str,
    design_csv: str = None,
    random_seed: int = 1234,
    dtype: torch.dtype = torch.float64,
) -> Any:
    """Create an Astir object from an expression CSV and marker YAML

    :param csv_input: Path to input csv containing expression for cells (rows) by proteins (columns). First column is
        cell identifier, and additional column names are gene identifiers.
    :param marker_yaml: Path to input YAML file containing marker gene information. Should include cell_type and cell_state
        entries. See documention.
    :param design_csv: Path to design matrix as a CSV. Rows should be cells, and columns covariates. First column is cell
        identifier, and additional column names are covariate identifiers.
    :param random_seed: The random seed to be used to initialize variables,
        defaults to 1234
    :param dtype: datatype of the model parameters, defaults to torch.float64
    """
    df_gex = pd.read_csv(csv_input, index_col=0)

    design = None
    if design_csv is not None:
        design = pd.read_csv(design_csv, index_col=0)
    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)
    from astir.astir import Astir

    return Astir(df_gex, marker_dict, design, random_seed, dtype=dtype)
示例#2
0
def from_anndata_yaml(
    anndata_file: str,
    marker_yaml: str,
    protein_name: str = None,
    cell_name: str = None,
    batch_name: str = "batch",
    random_seed: int = 1234,
    dtype: torch.dtype = torch.float64,
) -> Any:
    """Create an Astir object from an :class:`anndata.Anndata` file and a
        marker yaml

    :param anndata_file: Path to an :class:`anndata.Anndata` `h5py` file
    :param marker_yaml: Path to input YAML file containing marker gene information. Should include cell_type and cell_state
        entries. See documention.
    :param protein_name: The column of `adata.var` containing protein names. If this is none, defaults to `adata.var_names`
    :param cell_name:  The column of `adata.obs` containing cell names. If this is none, defaults to `adata.obs_names`
    :param batch_name: The column of `adata.obs` containing batch names. A design matrix
        will be built using this (if present) using a one-hot encoding to
        control for batch, defaults to 'batch'
    :param random_seed: The random seed to be used to initialize variables,
        defaults to 1234
    :param dtype: datatype of the model parameters, defaults to torch.float64
    :return: An object of class `astir_bash.py.Astir` using data imported from the loom files
    """
    # TODO: This function is memory inefficient and goes against the philosophy of loom files. Should be improved
    batch_list = None

    ad = anndata.read_h5ad(anndata_file)

    df_gex = pd.DataFrame(ad.X)

    if protein_name is not None:
        df_gex.columns = ad.var[protein_name]
    else:
        df_gex.columns = ad.var_names

    if cell_name is not None:
        df_gex.index = ad.obs[cell_name]
    else:
        df_gex.index = ad.obs_names

    if batch_name is not None:
        batch_list = ad.obs[batch_name]

    design = None

    if batch_list is not None:
        design = (OneHotEncoder().fit_transform(batch_list.to_numpy().reshape(
            -1, 1)).todense())
        design = design[:, :-1]  # remove final column
        design = np.concatenate([np.ones((design.shape[0], 1)), design],
                                axis=1)

    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)
    from astir.astir import Astir

    return Astir(df_gex, marker_dict, design, random_seed, dtype)
示例#3
0
def from_loompy_yaml(
    loom_file: str,
    marker_yaml: str,
    protein_name_attr: str = "protein",
    cell_name_attr: str = "cell_name",
    batch_name_attr: str = "batch",
    create_design_mat: bool = True,
    random_seed: int = 1234,
    dtype: torch.dtype = torch.float64,
) -> Any:
    """Create an Astir object from a loom file and a marker yaml

    :param loom_file: Path to a loom file, where rows correspond to proteins and columns to cells
    :param marker_yaml: Path to input YAML file containing marker gene information. Should include cell_type and cell_state
        entries. See documention.
    :param protein_name_attr: The attribute (key) in the row attributes that identifies the protein names
        (required to match with the marker gene information), defaults to
        protein
    :param cell_name_attr: The attribute (key) in the column attributes that
        identifies the name of each cell, defaults to cell_name
    :param batch_name_attr: The attribute (key) in the column attributes that identifies the batch. A design matrix
        will be built using this (if present) using a one-hot encoding to
        control for batch, defaults to batch
    :param create_design_mat: Determines whether a design matrix is created. Defaults to True.
    :param random_seed: The random seed to be used to initialize variables,
        defaults to 1234
    :param dtype: datatype of the model parameters, defaults to torch.float64
    :return: An object of class `astir_bash.py.Astir` using data imported from the loom files
    """
    # TODO: This function is memory inefficient and goes against the philosophy of loom files. Should be improved
    batch_list = None
    with loompy.connect(loom_file) as ds:
        df_gex = pd.DataFrame(ds[:, :].T)
        df_gex.columns = ds.ra[protein_name_attr]

        if cell_name_attr in ds.ca.keys():
            df_gex.index = ds.ca[cell_name_attr]

        if batch_name_attr in ds.ca.keys():
            batch_list = ds.ca[batch_name_attr]

    design = None

    if batch_list is not None and create_design_mat == True:
        design = OneHotEncoder().fit_transform(batch_list.reshape(
            -1, 1)).todense()
        design = design[:, :-1]  # remove final column
        design = np.concatenate([np.ones((design.shape[0], 1)), design],
                                axis=1)

    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)
    from astir.astir import Astir

    return Astir(df_gex, marker_dict, design, random_seed, dtype)
示例#4
0
def from_csv_dir_yaml(
    input_dir: str,
    marker_yaml: str,
    create_design_mat: bool = True,
    random_seed: int = 1234,
    dtype: torch.dtype = torch.float64,
) -> Any:
    """Create an Astir object a directory containing multiple csv files

    :param input_dir: Path to a directory containing multiple CSV files, each in the format expected by
        `from_csv_yaml`
    :param marker_yaml: Path to input YAML file containing marker gene information. Should include cell_type and cell_state
        entries. See documention.
    :param design_csv: Path to design matrix as a CSV. Rows should be cells, and columns covariates. First column is cell
        identifier, and additional column names are covariate identifiers
    :param create_design_mat: Determines whether a design matrix is created. Defaults to True.
    :param random_seed: The random seed to be used to initialize variables,
        defaults to 1234
    :param dtype: datatype of the model parameters, defaults to torch.float64
    """
    # TODO: add text explaining concatenation
    # Parse the input directory
    csv_files = [
        os.path.join(input_dir, f) for f in os.listdir(input_dir)
        if f.endswith("csv")
    ]

    # Read to gene expression df and parse
    dfs = [pd.read_csv(f, index_col=0) for f in csv_files]
    df_gex = pd.concat(dfs, axis=0)

    design = None
    if create_design_mat == True:
        # Construct a sample specific design matrix
        design_list = [
            np.repeat(str(i), dfs[i].shape[0]) for i in range(len(dfs))
        ]
        design = (OneHotEncoder().fit_transform(
            np.concatenate(design_list, axis=0).reshape(-1, 1)).todense())
        design = design[:, :-1]  # remove final column
        design = np.concatenate([np.ones((design.shape[0], 1)), design],
                                axis=1)  # add in intercept!

    with open(marker_yaml, "r") as stream:
        marker_dict = yaml.safe_load(stream)
    from astir.astir import Astir

    return Astir(df_gex, marker_dict, design, random_seed, dtype)
示例#5
0
    def test_command_all_flags(self):
        warnings.filterwarnings("ignore", category=UserWarning)
        (
            design,
            max_epochs,
            lr,
            batch_size,
            random_seed,
            n_init,
            n_init_epochs,
            dtype,
            delta_loss,
            delta_loss_batch,
        ) = (None, 2, 1e-1, 128, 1234, 1, 1, torch.float64, 1e-3, 10)
        bash_command = "python -W ignore {} {} {} {} {}".format(
            self.exec_path,
            "state",
            self.expr_csv_file,
            self.marker_yaml_file,
            self.output_file,
        )
        bash_command += " --design {}".format(design)
        bash_command += " --max_epochs {}".format(max_epochs)
        bash_command += " --learning_rate {}".format(lr)
        bash_command += " --batch_size {}".format(batch_size)
        bash_command += " --random_seed {}".format(random_seed)
        bash_command += " --n_init {}".format(n_init)
        bash_command += " --n_init_epochs {}".format(n_init_epochs)
        bash_command += " --dtype {}".format(dtype)
        bash_command += " --delta_loss {}".format(delta_loss)
        bash_command += " --delta_loss_batch {}".format(delta_loss_batch)
        process = subprocess.Popen(bash_command.split(),
                                   stdout=subprocess.PIPE)
        output, error = process.communicate()
        self.assertIsNone(error)

        # Create Astir object to compare
        ast = Astir(
            input_expr=self.expr,
            marker_dict=self.marker_dict,
            design=design,
            random_seed=random_seed,
            dtype=dtype,
        )

        ast.fit_state(
            max_epochs=max_epochs,
            learning_rate=lr,
            batch_size=batch_size,
            delta_loss=delta_loss,
            n_init=n_init,
            n_init_epochs=n_init_epochs,
            delta_loss_batch=delta_loss_batch,
        )

        expected_assign = ast.get_cellstates()
        actual_assign = pd.read_csv(self.output_file, index_col=0)
        self.assertEqual(len(expected_assign), len(actual_assign))
        self.assertTrue(
            (expected_assign.columns == actual_assign.columns).all())

        self.assertTrue(
            (abs(actual_assign.to_numpy() - expected_assign.to_numpy()) <
             0.01).all())