Example #1
    def test_normalization(self):
        ds = get_dataset('8kmy')
        # ignore overflow warning
        with catch_warnings_ignore(RuntimeWarning):
            ds1 = ds.expm1(omic=OMIC.transcriptomic, inplace=False)
            ds2 = ds.expm1(omic=OMIC.proteomic, inplace=False)
            self.assertTrue(np.all(np.expm1(ds.X) == ds1.X))
            self.assertTrue(
                np.all(
                    np.expm1(ds.numpy(OMIC.proteomic)) == ds2.numpy(
                        OMIC.proteomic)))

        ds1 = ds.normalize(OMIC.transcriptomic,
                           inplace=False,
                           log1p=True,
                           scale=False,
                           total=False)
        ds2 = ds.normalize(OMIC.proteomic,
                           inplace=False,
                           log1p=True,
                           scale=False,
                           total=False)
        self.assertTrue(
            np.all(ds1.numpy(OMIC.transcriptomic) == np.log1p(ds.X)))
        self.assertTrue(
            np.all(ds1.numpy(OMIC.proteomic) == ds.numpy(OMIC.proteomic)))
        self.assertTrue(
            np.all(
                ds2.numpy(OMIC.proteomic) == np.log1p(ds.numpy(
                    OMIC.proteomic))))
        self.assertTrue(
            np.all(
                ds2.numpy(OMIC.transcriptomic) == ds.numpy(
                    OMIC.transcriptomic)))
Example #2
    def test_metrics(self):
        sco = get_dataset('8kmy')
        with catch_warnings_ignore(ConvergenceWarning):
            sco.rank_vars_groups(clustering='kmeans')
            sco.calculate_quality_metrics()
            with sco._swap_omic('prot'):
                sco.rank_vars_groups(clustering='kmeans')
                sco.calculate_quality_metrics()

            if _SCVI:
                sco = get_dataset('cortex')
                sco.rank_vars_groups(clustering='kmeans')
                sco.calculate_quality_metrics()
                with sco._swap_omic('cell'):
                    sco.rank_vars_groups(clustering='kmeans')
                    sco.calculate_quality_metrics()
Example #3
  def test_variational_model(self):
    sco = get_dataset(_DS)
    n_genes = sco.n_vars
    n_prots = sco.numpy(OMIC.proteomic).shape[1]
    vae = VariationalAutoEncoder(outputs=[
        RandomVariable(dim=n_genes, posterior='zinb', name=OMIC.transcriptomic),
        RandomVariable(dim=n_prots, posterior='nbd', name=OMIC.proteomic)
    ])
    vae.fit(sco, epochs=_EPOCHS, verbose=False)
    self._loss_not_rise(vae.train_history['loss'])
    self._loss_not_rise(vae.valid_history['val_loss'])

    X = sco.numpy()[:128]
    (pX, pY), qZ = vae.predict(X, sample_shape=2, verbose=False)

    self.assertTrue(isinstance(pX.distribution, bay.distributions.ZeroInflated))
    self.assertTrue(
        isinstance(pX.distribution.count_distribution,
                   bay.distributions.NegativeBinomial))
    self.assertTrue(
        isinstance(pY.distribution, bay.distributions.NegativeBinomialDisp))
    self.assertTrue(pX.batch_shape[0] == 2 and pX.batch_shape[1] == X.shape[0])
    self.assertTrue(pY.batch_shape[0] == 2 and pY.batch_shape[1] == X.shape[0])
    self.assertTrue(isinstance(qZ, bay.distributions.MultivariateNormalDiag))
    self.assertTrue(qZ.sample().shape == (X.shape[0],
                                          vae.latents[0].event_shape[0]))
Example #4
 def create_posterior(self,
                      test_sco: SingleCellOMIC = None,
                      dropout_rate=0.2,
                      retain_rate=0.2,
                      corrupt_distribution='binomial',
                      batch_size=8,
                      sample_shape=10,
                      reduce_latents=partial(tf.concat, axis=1),
                      verbose=True,
                      train_percent=0.8,
                      random_state=1) -> Posterior:
   r""" Create a `Posterior` object for evaluation """
   if not self.is_fitted:
     raise RuntimeError("fit() must be called before creating Posterior.")
   ###
   if isinstance(test_sco, SingleCellOMIC):
     test = test_sco
   elif self.dataset is None:
     raise ValueError(
         "Call SingleCellModel.set_metadata() to track the fitted dataset.")
   else:
     ds = get_dataset(self.dataset)
     _, test = ds.split(train_percent=train_percent, seed=random_state)
   ###
   return Posterior(scm=self,
                    sco=test,
                    dropout_rate=dropout_rate,
                    retain_rate=retain_rate,
                    corrupt_distribution=corrupt_distribution,
                    batch_size=batch_size,
                    sample_shape=sample_shape,
                    reduce_latents=reduce_latents,
                    verbose=verbose,
                    name=f"{self.id}_{self.dataset}",
                    random_state=random_state)
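A minimal usage sketch for the method above, assuming `vae` is a model that has already been fitted on a SingleCellOMIC dataset as in the test examples (the variable names here are hypothetical):

# hypothetical usage: build a Posterior for a held-out split of '8kly'
_, test = get_dataset('8kly').split(train_percent=0.8)
post = vae.create_posterior(test_sco=test,
                            batch_size=8,
                            sample_shape=10,
                            verbose=True)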
Example #5
 def test_visualization_celltype(self):
     sco = get_dataset('cortex')
     for X, var_names, rank_genes, clustering, dendrogram in itertools.product(
          ('cell', 'tran'),
         (None, 10),
         (0, 3),
         ('kmeans', 'louvain', None),
         (True, False)):
         if X == 'cell' and rank_genes > 0:
             continue
         # check louvain available
         if clustering == 'louvain':
             try:
                 import louvain
             except ImportError:
                 continue
         # plotting
         with catch_warnings_ignore(ignore_warnings):
             sco.plot_heatmap(X=X,
                              groupby=OMIC.celltype,
                              var_names=var_names,
                              clustering=clustering,
                              rank_genes=rank_genes)
             sco.plot_dotplot(X=X,
                              groupby=OMIC.celltype,
                              var_names=var_names,
                              clustering=clustering,
                              rank_genes=rank_genes)
             sco.plot_stacked_violins(X=X,
                                      groupby=OMIC.celltype,
                                      var_names=var_names,
                                      clustering=clustering,
                                      rank_genes=rank_genes)
     sco.save_figures('/tmp/tmp2.pdf')
Example #6
 def test_filters(self):
     ds = get_dataset('8kmy')
     ds1 = ds.filter_highly_variable_genes(inplace=False)
     ds2 = ds.filter_genes(inplace=False, min_counts=100)
     ds3 = ds.filter_cells(inplace=False, min_counts=1000)
     self.assertTrue(ds1.shape[1] == 999)
     self.assertTrue(np.min(ds2.X.sum(0)) == 100)
     self.assertTrue(np.min(ds3.X.sum(1)) == 1000)
Example #7
 def on_load_data(self, cfg):
     ds = cfg.dataset
     sco = get_dataset(ds.name)
     if cfg.verbose:
         print(sco)
     train, test = sco.split(train_percent=ds.train_percent)
     self.sco = sco
     self.train = train
     self.test = test
Example #8
    def preprocess(self):
        ds, gene_ds, prot_ds = get_dataset(dataset_name="pbmc_citeseq",
                                           override=False)
        expression_data = gene_ds.X
        gene_symbols = gene_ds.X_col
        self.gene_symbols = gene_symbols
        self.cell_names = gene_ds.X_row

        self.adt_expression = prot_ds.X
        self.protein_markers = prot_ds.X_col

        assert np.all(gene_ds.X_row == prot_ds.X_row)

        return expression_data
Example #9
def get_arguments():
    args = ArgController(
    ).add("input", "Name of the dataset or path to csv file"
    ).add("-n", "number of GMM components", 2
    ).add("-idx", "index of the positive component", 1
    ).add("-norm", "method for normalizing: raw, log", 'log', ('log', 'raw')
    ).add("-outpath", "y_bin and y_prob will be saved to this path", ''
    ).add("-figpath", "path for saving analysis figure", '/tmp/tmp.pdf'
    ).add("--verbose", "Enable verbose and saving diagnosis", False
    ).parse()
    inp = str(args.input)
    if os.path.exists(inp):
        assert os.path.isfile(inp), "%s must be path to a file" % inp
        data = []
        with open(inp, 'r') as f:
            for line in f:
                data.append(line.strip().split(','))
        data = np.array(data)
        if all(is_number(i, string_number=True) for i in data[0]):
            y_prot = data.astype('float32')
            y_prot_names = np.array(
                ['#%d' % i for i in range(y_prot.shape[1])])
        else:
            y_prot = data[1:].astype('float32')
            y_prot_names = data[0]
        outpath = args.outpath
    else:
        from sisua.data import get_dataset
        ds, gene_ds, prot_ds = get_dataset(inp, override=False)
        y_prot = ds['y']
        y_prot_names = np.array(ds['y_col'])
        outpath = ds.path if args.outpath == '' else args.outpath
    return {
        'y_prot': y_prot,
        'y_prot_names': y_prot_names,
        'n_components': int(args.n),
        'index': int(args.idx),
        'log_norm': True if args.norm == 'log' else False,
        'outpath': outpath if len(outpath) > 0 else None,
        'figpath': args.figpath if len(args.figpath) > 0 else None,
        'verbose': bool(args.verbose)
    }
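A short sketch of consuming the returned dictionary (illustrative only; the downstream GMM fitting is not part of this snippet):

# hypothetical usage: parse the CLI arguments and inspect the protein matrix
args = get_arguments()
y_prot, y_prot_names = args['y_prot'], args['y_prot_names']
print("Proteins :", ', '.join(y_prot_names))
print("Shape    :", y_prot.shape)
print("GMM components: %d, positive index: %d" %
      (args['n_components'], args['index']))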
Example #10
 def test_corruption(self):
     ds = get_dataset('8kmy')
     ds1 = ds.corrupt(dropout_rate=0.25, inplace=False)
     ds2 = ds.corrupt(dropout_rate=0.5, inplace=False)
     ds3 = ds.corrupt(dropout_rate=0.5, inplace=False, omic=OMIC.proteomic)
     ds4 = ds.corrupt(dropout_rate=0.5,
                      inplace=False,
                      omic=OMIC.proteomic,
                      distribution='uniform')
     self.assertTrue(ds.sparsity() < ds1.sparsity() < ds2.sparsity())
     om = OMIC.proteomic
     self.assertTrue(ds.sparsity(om) < ds3.sparsity(om) < ds4.sparsity(om))
     # multi corruption
     ds1 = ds.corrupt(omic=OMIC.transcriptomic | OMIC.proteomic,
                      dropout_rate=0.5,
                      inplace=False)
     self.assertTrue(\
       ds1.sparsity(OMIC.transcriptomic) > ds.sparsity(OMIC.transcriptomic) and
       ds1.sparsity(OMIC.proteomic) > ds.sparsity(OMIC.proteomic))
Example #11
 def test_basic_functionalities(self):
     ds = get_dataset('8kmy')
     # split
     train, test = ds.split()
     self.assertEqual(
         set(train.cell_id) | set(test.cell_id), set(ds.cell_id))
     # copy
     copy1 = ds.copy()  # copy backed dataset
     copy2 = train.copy()  # copy view dataset
     copy3 = ds.copy().apply_indices(test.indices)
     _equal(self, copy1, ds)
     _equal(self, copy2, train)
     _equal(self, copy3, test)
     # split again
     train1, test1 = ds.split()
     train.assert_matching_cells(train1)
     test.assert_matching_cells(test1)
     _equal(self, train, train1)
     _equal(self, test, test1)
Example #12
 def prepare(self):
     with catch_warnings_ignore(RuntimeWarning):
         sco = get_dataset('cortex')
         om1, om2 = sco.omics
         train, test = sco.split(train_percent=0.8, seed=1)
         n_gene = sco.numpy(om1).shape[1]
         n_prot = sco.numpy(om2).shape[1]
         rvs = [
             RandomVariable(n_gene, 'zinbd', om1.name),
             RandomVariable(n_prot, 'onehot', om2.name)
         ]
         all_models = [
             DeepCountAutoencoder, SCALE, SCVI, VariationalAutoEncoder
         ]
         all_configs = [
             NetworkConfig(),
             NetworkConfig(pyramid=True),
             NetworkConfig(use_conv=True),
             NetworkConfig(pyramid=True, use_conv=True)
         ]
         return train, test, rvs, all_models, all_configs
Example #13
    def test_embedding(self):
        ds = get_dataset('8kmy')
        ds.probabilistic_embedding(OMIC.proteomic)
        prob = ds.probability()
        bina = ds.binary()
        self.assertTrue(np.all(np.logical_and(0. < prob, prob < 1.)))
        self.assertTrue(np.all(np.unique(bina) == np.unique([0., 1.])))

        for algo in ('pca', 'tsne'):
            n = ds.n_obs
            pca1 = ds.dimension_reduce(n_components=2, algo=algo)
            pca2 = ds.dimension_reduce(OMIC.proteomic,
                                       n_components=3,
                                       algo=algo)
            self.assertTrue(pca1.shape == (n, 2))
            self.assertTrue(pca2.shape == (n, 3) if algo == 'pca' else \
              pca2.shape == (n, 2))
            name1 = '%s_%s' % (OMIC.proteomic.name, algo)
            name2 = '%s_%s' % (OMIC.transcriptomic.name, algo)
            self.assertTrue(name1 in ds.obsm and name1 in ds.uns)
            self.assertTrue(name2 in ds.obsm and name2 in ds.uns)
Example #14
  def test_scvi(self):
    sco = get_dataset(_DS)
    train, test = sco.split()
    scvi = SCVI(RandomVariable(sco.n_vars, posterior='zinbd', name='rna'))
    scvi.fit(train, epochs=_EPOCHS, verbose=False)
    pX, (qZ, qL) = scvi.predict(test, verbose=False)

    self._loss_not_rise(scvi.train_history['loss'])
    self._loss_not_rise(scvi.valid_history['val_loss'])

    self.assertTrue(isinstance(pX.distribution, bay.distributions.ZeroInflated))
    self.assertTrue(
        isinstance(pX.distribution.count_distribution,
                   bay.distributions.NegativeBinomialDisp))
    self.assertTrue(pX.batch_shape[0] == 1 and pX.batch_shape[1] == test.n_obs)

    self.assertTrue(isinstance(qZ, bay.distributions.MultivariateNormalDiag))
    self.assertTrue(
        qZ.sample(1).shape == (1, test.n_obs, scvi.latents[0].event_shape[0]))

    self.assertTrue(isinstance(qL.distribution, bay.distributions.Normal))
    self.assertTrue(qL.sample(1).shape == (1, test.n_obs, 1))
Example #15
  def test_unsupervised_fit_predict(self):
    sco = get_dataset(_DS)
    train, test = sco.split()
    self.assertTrue(sco.n_omics >= 2)
    dca = DeepCountAutoencoder(outputs=RandomVariable(dim=sco.n_vars,
                                                      posterior='mse'),
                               latent_dim=10)
    dca.fit(train, epochs=_EPOCHS, verbose=False)
    dca.fit(train.numpy(), epochs=_EPOCHS, verbose=False)
    self._loss_not_rise(dca.train_history['loss'])
    self._loss_not_rise(dca.valid_history['val_loss'])

    pX, qZ = dca.predict(test, sample_shape=2, verbose=False)
    self.assertTrue(isinstance(pX, bay.distributions.VectorDeterministic))
    self.assertTrue(pX.batch_shape[0] == 2 and pX.batch_shape[1] == test.n_obs)
    self.assertTrue(isinstance(qZ, bay.distributions.VectorDeterministic))

    X = sco.numpy()[:128]
    pX, qZ = dca.predict(X, sample_shape=2, verbose=False)
    self.assertTrue(isinstance(pX, bay.distributions.VectorDeterministic))
    self.assertTrue(pX.batch_shape[0] == 2 and pX.batch_shape[1] == X.shape[0])
    self.assertTrue(isinstance(qZ, bay.distributions.VectorDeterministic))
Example #16
  def test_semi_supervised(self):
    sco = get_dataset(_DS)
    n_genes = sco.n_vars
    n_prots = sco.numpy(OMIC.proteomic).shape[1]
    sisua = SISUA(rna_dim=n_genes, adt_dim=n_prots, alternative_nb=True)
    sisua.fit(sco, epochs=_EPOCHS, verbose=False)
    self._loss_not_rise(sisua.train_history['loss'])
    self._loss_not_rise(sisua.valid_history['val_loss'])

    X = sco.numpy()[:128]
    (pX, pY), qZ = sisua.predict(X, sample_shape=2, verbose=False)

    self.assertTrue(isinstance(pX.distribution, bay.distributions.ZeroInflated))
    self.assertTrue(
        isinstance(pX.distribution.count_distribution,
                   bay.distributions.NegativeBinomialDisp))
    self.assertTrue(
        isinstance(pY.distribution, bay.distributions.NegativeBinomialDisp))
    self.assertTrue(pX.batch_shape[0] == 2 and pX.batch_shape[1] == X.shape[0])
    self.assertTrue(pY.batch_shape[0] == 2 and pY.batch_shape[1] == X.shape[0])
    self.assertTrue(isinstance(qZ, bay.distributions.MultivariateNormalDiag))
    self.assertTrue(
        qZ.sample(1).shape == (1, X.shape[0], sisua.latents[0].event_shape[0]))
Example #17

def extract_pca(p_train, p_test):
    # p_train, p_test: lists of output and latent distributions for the
    # train and test sets
    pca = [
        fast_pca(squeeze(train.mean()), squeeze(test.mean()),
                 n_components=2)[-1] for train, test in zip(p_train, p_test)
        if train.event_shape[0] > 1
    ]
    return pca
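A hedged sketch of how extract_pca might be fed; it assumes a fitted two-output model whose predict() returns output and latent distributions as in the VAE test examples above (the variable names are hypothetical):

# hypothetical usage: 2-D PCA of the predicted means on train vs. test
(pX_train, pY_train), qZ_train = vae.predict(train.numpy(), verbose=False)
(pX_test, pY_test), qZ_test = vae.predict(test.numpy(), verbose=False)
pca_2d = extract_pca(p_train=[pX_train, pY_train, qZ_train],
                     p_test=[pX_test, pY_test, qZ_test])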


# ===========================================================================
# Load data
# ===========================================================================
sco = get_dataset('cortex')
train, test = sco.split(train_percent=0.8, seed=1)
n_gene = sco.numpy(OMIC.transcriptomic).shape[1]
n_prot = sco.numpy(OMIC.celltype).shape[1]

gene_rv = RVmeta(n_gene, 'zinb', 'rna')
prot_rv = RVmeta(n_prot, 'nb', 'adt')
latent_dim = 10
all_models = [SCALE, SCVI, DeepCountAutoencoder, VariationalAutoEncoder]
all_configs = [
    NetConf(),
    NetConf(pyramid=True),
    NetConf(use_conv=True),
    NetConf(pyramid=True, use_conv=True)
]
Example #18
def train_and_evaluate(ds_name, exp_name):
  from sisua.inference import InferenceSCVAE, InferenceSCVI, InferenceSISUA
  from sisua.analysis import Posterior, ResultsSheet

  ds, gene, prot = get_dataset(ds_name)

  # make sure gene expression stays the same
  assert np.all(gene.X_train == gene_eval.X_train) and \
  np.all(gene.X_test == gene_eval.X_test)

  print("\n======== Running experiment ========")
  print("Training %d-proteins:" % len(prot.col_name),
    ', '.join([standardize_protein_name(i) for i in prot.col_name]))
  print("Testing  %d-proteins:" % len(prot_eval.col_name),
    ', '.join([standardize_protein_name(i) for i in prot_eval.col_name]))

  n_prots = prot.feat_dim

  # ====== Main model training ====== #
  models = [
      InferenceSCVAE(gene_dim=n_genes),
      InferenceSCVI(gene_dim=n_genes),
      InferenceSISUA(gene_dim=n_genes, prot_dim=n_prots),
  ]
  for m in models:
    m.fit(X=gene.X_train,
          y=prot.X_train if m.is_semi_supervised else None,
          corruption_rate=corruption_rate, corruption_dist=corruption_dist,
          n_epoch=n_epoch, batch_size=batch_size,
          detail_logging=False)

  # ====== evaluation ====== #
  pos = [Posterior(m, ds=eval_ds)
         for m in models]

  res = ResultsSheet(pos, verbose=True)

  res.plot_learning_curves(
  ).save_plots(
      os.path.join(FIGURE_PATH, 'learning_curves_%s.pdf' % exp_name))

  res.plot_correlation_marker_pairs(
  ).save_plots(
      os.path.join(FIGURE_PATH, 'correlation8k_%s.pdf' % exp_name))

  res.plot_latents_binary_scatter(test=False
  ).plot_latents_binary_scatter(test=True
  ).save_plots(
      os.path.join(FIGURE_PATH, 'latent8k_%s.pdf' % exp_name))

  res.plot_scores(score_type='classifier'
  ).save_plots(
      os.path.join(FIGURE_PATH, 'classifier8k_%s.pdf' % exp_name))

  # ====== cross ds ====== #
  pos = [Posterior(m, ds=cross_ds)
         for m in models]

  res = ResultsSheet(pos, verbose=True)

  res.plot_correlation_marker_pairs(
  ).save_plots(
      os.path.join(FIGURE_PATH, 'correlationECC_%s.pdf' % exp_name))

  res.plot_latents_binary_scatter(test=False
  ).plot_latents_binary_scatter(test=True
  ).save_plots(
      os.path.join(FIGURE_PATH, 'latentECC_%s.pdf' % exp_name))

  res.plot_scores(score_type='classifier'
  ).save_plots(
      os.path.join(FIGURE_PATH, 'classifierECC_%s.pdf' % exp_name))
Example #19
from odin.utils import ArgController, stdio
from odin.utils.mpi import MPI
from sisua.analysis import Posterior
from sisua.data import get_dataset
from sisua.models.autoencoder import DeepCountAutoencoder
from sisua.models.scvi_models import SCVI
from sisua.models.semi_supervised import MultitaskAutoEncoder, multitaskVAE
from sisua.models.variational_autoencoder import VariationalAutoEncoder

# turn off TF logging and set a reproducible random seed
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.random.set_seed(8)
np.random.seed(8)

x, y = get_dataset('pbmc8kly')
x_train, x_test = x.split()
y_train, y_test = y.split()
x_train.assert_matching_cells(y_train)
x_test.assert_matching_cells(y_test)

flags = ArgController().add('--no-train', 'Stop training',
                            False).add('--no-score', 'Stop scoring',
                                       False).add('--analyze', "Analyzing",
                                                  False).parse()
no_train = flags.no_train
no_score = flags.no_score
analyze = flags.analyze
# assume the scores are already computed when analyze is enabled
if analyze:
  no_train = True
Example #20
# ====== save path for all trained models ====== #
path = '/tmp/pbmc8k_cellvdj'
if os.path.exists(path) and override:
    print("Overriding path: %s" % path)
    shutil.rmtree(path)

if not os.path.exists(path):
    os.mkdir(path)
if os.path.isfile(path):
    raise ValueError("'%s' must be folder path" % path)

# ===========================================================================
# Train on PBMC8k-ly
# ===========================================================================
x, y = get_dataset('pbmc8kly')
x_train, x_test = x.split()
y_train, y_test = y.split()
gene_name1 = x.var['geneid']

n_genes = x.shape[1]
n_prot = y.shape[1]

all_models = [
    DeepCountAutoencoder(units=n_genes),
    SCVI(units=n_genes),
    VariationalAutoEncoder(units=n_genes),
    MultitaskVAE(units=[n_genes, n_prot]),
    MultitaskVI(units=[n_genes, n_prot]),
]
Example #21
max_evals = 80
algorithm = 'bayes'
freq = 1000  # means the callback only runs at on_train_end

path = '/tmp/autotune'
if os.path.exists(path):
    shutil.rmtree(path)
os.mkdir(path)

# sc_metrics is more robust to NaN values
# TODO: accept a list of loss_name
stdio(os.path.join(path, 'fit_hyper.txt'))
# ===========================================================================
# Cortex
# ===========================================================================
x, y = get_dataset('cortex')
x.filter_cells(min_counts=1).filter_genes(min_counts=1)
gene = x.shape[1]
prot = y.shape[1]

SCVI.fit_hyper(x,
               loss_name='nllk0',
               model_kwargs=dict(units=gene, xdist='zinbd'),
               fit_kwargs=dict(epochs=epochs,
                               batch_size=batch_size,
                               callbacks=[NegativeLogLikelihood(freq=freq)]),
               max_evals=max_evals,
               save_path=os.path.join(path, 'scvi_cortex'),
               algorithm=algorithm,
               verbose=True)
Example #22
FIGURE_PATH = '/tmp/cross_datasets'

corruption_rate = 0.25
corruption_dist = 'binomial'

n_epoch = 1
batch_size = 128

if not os.path.exists(FIGURE_PATH):
    os.mkdir(FIGURE_PATH)

# ===========================================================================
# Load dataset
# ===========================================================================
all_datasets = {
    '8k': get_dataset('cross8k_ly'),
    'ecc': get_dataset('crossecc_ly')
}

# ====== check gene expression is matching ====== #
genes_name = None
all_proteins = None

for name, (ds, gene, prot) in all_datasets.items():
    if genes_name is None:
        genes_name = gene.col_name
    else:
        assert np.all(
            gene.col_name == genes_name), "Training gene sets do not match"

    prots_name = set([standardize_protein_name(i) for i in prot.col_name])
Example #23
def filtering_experiment_path(ds_name,
                              incl_keywords,
                              excl_keywords,
                              fn_filter=None,
                              return_dataset=False,
                              print_log=False,
                              exp_path=''):
  r"""

  Parameters
  ----------
  ds_name : string
      direct path to experiments folder or name of the dataset

  incl_keywords : string
      list of keywords for including the experiments (connect by ',')

  excl_keywords : string
      list of keywords for excluding the experiments (connect by ',')

  exp_path : string
      optional, if not given, use SISUA_EXP

  Return
  ------
  dictionary
  corruption_config -> list of absolute path to all satisfied experiments

  Note
  ----
  only finished experiments are select, i.e. the experiment folder contain
  2 files 'config.pkl' and 'model.pkl'

  """
  from sisua.data import EXP_DIR, get_dataset
  ds_name = str(ds_name)
  if exp_path is None:
    exp_path = ''
  exp_path = str(exp_path)
  if len(exp_path) == 0:
    exp_path = EXP_DIR
  assert os.path.isdir(exp_path), exp_path
  # ====== check the keywords ====== #
  if incl_keywords is None:
    incl_keywords = []
  if excl_keywords is None:
    excl_keywords = []
  if fn_filter is None:
    fn_filter = lambda keywords: True
  # ====== get the exp path ====== #
  if ds_name is None or return_dataset:
    (ds, gene_ds, prot_ds) = get_dataset(ds_name)
    ds_name = ds.name
  exp_path = os.path.join(exp_path, ds_name)
  assert os.path.exists(exp_path), "Experiment path '%s' must exist" % exp_path
  # ====== Extract all experiments ====== #
  all_exp = []
  for name in os.listdir(exp_path):
    path = os.path.join(exp_path, name)
    # check if experiments finished
    if os.path.exists(os.path.join(path, 'model.pkl')):
      all_exp.append(path)
  all_exp = sorted(all_exp)
  # ====== start filtering ====== #
  if isinstance(incl_keywords, string_types):
    incl_keywords = [i for i in str(incl_keywords).split(',') if len(i) > 0]
  elif isinstance(incl_keywords, (tuple, list)):
    incl_keywords = as_tuple(incl_keywords, t=str)
  else:
    raise ValueError("No support for incl_keywords type: %s" %
                     str(type(incl_keywords)))

  if isinstance(excl_keywords, string_types):
    excl_keywords = [i for i in str(excl_keywords).split(',') if len(i) > 0]
  elif isinstance(excl_keywords, (tuple, list)):
    excl_keywords = as_tuple(excl_keywords, t=str)
  else:
    raise ValueError("No support for excl_keywords type: %s" %
                     str(type(excl_keywords)))

  all_exp = [
      i for i in all_exp if all(
          any(j in keyword
              for keyword in os.path.basename(i).split('_'))
          for j in incl_keywords)
  ]
  all_exp = [
      i for i in all_exp if all(
          all(j not in keyword
              for keyword in os.path.basename(i).split('_'))
          for j in excl_keywords)
  ]

  # filter function
  all_exp = [i for i in all_exp if fn_filter(os.path.basename(i).split('_'))]
  # ====== logging ====== #
  if bool(print_log):
    print(ctext("Found following experiments:", 'lightyellow'))
    # all_exp is a flat list of experiment paths at this point
    for path in all_exp:
      print('  ', os.path.basename(path))

  if return_dataset:
    return all_exp, ds, gene_ds, prot_ds
  return all_exp
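A hedged usage sketch (the keyword values are illustrative and not taken from a real experiment folder):

# hypothetical usage: list finished 'cortex' experiments whose folder name
# contains 'vae' while excluding any 'scvi' runs
exps = filtering_experiment_path(ds_name='cortex',
                                 incl_keywords='vae',
                                 excl_keywords='scvi',
                                 print_log=True)
for path in exps:
  print(os.path.basename(path))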
Example #24
def cross_analyze(datasets, outpath, models, nprocess=1, verbose=False):
    from sisua.data import get_dataset
    from sisua.data.path import EXP_DIR
    from sisua.data.utils import standardize_protein_name

    assert nprocess > 0, "Number of processes must be greater than 0"

    datasets = as_tuple(datasets, t=string_types)
    assert len(datasets) > 1, \
    "Require more than one datasets for cross analysis"
    if not os.path.exists(outpath):
        os.mkdir(outpath)

    models = as_tuple(models, t=string_types)
    assert len(models) > 0, \
    "At least one model must be given"

    # ====== load datasets ====== #
    global all_datasets
    all_datasets = {name: get_dataset(name)[0] for name in datasets}
    all_datasets = [
        (name,
         dict(
             X=ds['X'][:],
             X_col=ds['X_col'],
             X_row=ds['X_row'],
             y=ds['y'],
             y_col=np.array([standardize_protein_name(i)
                             for i in ds['y_col']]),
         )) for name, ds in all_datasets.items()
    ]

    # ====== check gene expression is matching ====== #
    genes = all_datasets[0][1]['X_col']
    for name, ds in all_datasets:
        assert np.all(ds['X_col'] == genes), "Training gene sets do not match"

    # ====== get the list of all overlapping protein ====== #
    all_proteins = set(all_datasets[0][1]['y_col'])
    for name, ds in all_datasets:
        all_proteins &= set(ds['y_col'])
    all_proteins = sorted(all_proteins)

    # ====== only select certain protein ====== #
    if verbose:
        print("Datasets       :", ctext(', '.join(datasets), 'yellow'))
        print("Model          :", ctext(', '.join(models), 'yellow'))
        print("Shared proteins:", ctext(', '.join(all_proteins), 'yellow'))
        for name, ds in all_datasets:
            print(" ", ctext(name, 'cyan'))
            print("   X    :", ds['X'].shape)
            print("   X_col:", ds['X_col'])
            print("   y    :", ds['y'].shape)
            print("   y_col:", ', '.join(ds['y_col']))

    # ====== load all the model ====== #
    all_models = []
    for ds_name in datasets:
        if verbose:
            print("Search model for dataset '%s' ..." %
                  ctext(ds_name, 'yellow'))
        exp_path = os.path.join(EXP_DIR, ds_name)
        for model_name in os.listdir(exp_path):
            if model_name.split('_')[0] in models:
                path = os.path.join(exp_path, model_name, 'model.pkl')
                if os.path.exists(path):
                    all_models.append(path)
                    if verbose:
                        print(" ", ctext(model_name, 'cyan'))
    if verbose:
        print("%s datasets and %s models => %s experiments" % (
            ctext(len(all_datasets), 'yellow'),
            ctext(len(all_models), 'yellow'),
            ctext(len(all_datasets) * len(all_models), 'yellow'),
        ))

    # ====== create all necessary dir in advance ====== #
    all_data_name = [i[0] for i in all_datasets]
    # model paths look like EXP_DIR/<ds_name>/<model_name>/model.pkl
    all_model_name = [i.split('/')[-2] for i in all_models]
    for name1, name2 in product(all_data_name, all_model_name):
        path = os.path.join(
            outpath, 'data%s_model%s' %
            (name1.replace('_', '').upper(), name2.replace('_', '').upper()))
        if not os.path.exists(path):
            os.mkdir(path)
            if verbose:
                print("Create output folder:", ctext(path, 'yellow'))

    # ====== start generate analysis ====== #
    processes = []
    for ds_name, ds in all_datasets:
        y_true = {
            i: j
            for i, j in zip(ds['y_col'], ds['y'].T) if i in all_proteins
        }
        # preserve the same order of all_proteins
        y_true = np.hstack([y_true[i][:, np.newaxis] for i in all_proteins])

        for model_path in all_models:
            processes.append(
                Process(target=_analyze,
                        args=(ds_name, model_path, outpath, y_true,
                              all_proteins, verbose)))
            if len(processes) >= nprocess:
                [p.start() for p in processes]
                [p.join() for p in processes]
                processes = []

    # finish the remain processes
    if len(processes) > 0:
        [p.start() for p in processes]
        [p.join() for p in processes]
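A hedged usage sketch (the model name strings are assumptions; they only need to match the prefix of the trained experiment folders under EXP_DIR):

# hypothetical usage: cross-dataset analysis over the two 'cross*' datasets
# used elsewhere in these examples, running two analyses at a time
cross_analyze(datasets=['cross8k_ly', 'crossecc_ly'],
              outpath='/tmp/cross_analysis',
              models=['vae', 'sisua'],
              nprocess=2,
              verbose=True)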
Example #25
network = NetworkConfig(use_conv=True, pyramid=True, conv_proj=128)
kl = interpolation.const(vmax=1)
# kl = interpolation.linear(vmin=0,
#                           vmax=10,
#                           norm=20,
#                           cyclical=True,
#                           delayOut=5,
#                           delayIn=5)
# maximum number of data points for testing (visualization)
n_samples_visualization = 300
DS_NAME = 'pbmc8kly'
# ===========================================================================
# Load data
# ===========================================================================
gene, prot = get_dataset(DS_NAME)
X_train, X_test = gene.split()
y_train, y_test = prot.split()
print("Labels:", prot.var)

gene_rv = RandomVariable(gene.n_vars, posterior='zinbd', name='rna')
prot_rv = RandomVariable(prot.n_vars, posterior='nb', name='adt')

# ====== prepare the labels ====== #
labels_name = standardize_protein_name(prot.var.iloc[:, 0].to_numpy())
if not y_test.is_binary:
  y_test.probabilistic_embedding()
  labels = np.argmax(y_test.obsm['X_prob'], axis=-1)
else:
  labels = np.argmax(y_test.X, axis=-1)
labels = np.array([labels_name[i] for i in labels])
Example #26
 def test_clustering(self):
     ds = get_dataset('8kmy')
     with catch_warnings_ignore(EfficiencyWarning):
         ds.clustering(algo='kmeans')
         ds.clustering(algo='knn')
Example #27
FIGURE_PATH = '/tmp/missing_protein'

corruption_rate = 0.25
corruption_dist = 'binomial'

n_epoch = 200
batch_size = 128

if not os.path.exists(FIGURE_PATH):
  os.mkdir(FIGURE_PATH)

# ===========================================================================
# Load dataset
# ===========================================================================
# for evaluating
ds_eval, gene_eval, prot_eval = get_dataset('cross8k_ly')
# for evaluating cross-dataset
ds_cross, gene_cross, prot_cross = get_dataset('crossecc_ly')

n_genes = gene_eval.feat_dim
eval_ds = dict(
    X_train=gene_eval.X_train,
    X_test=gene_eval.X_test,
    X_col=gene_eval.col_name,
    y_train=prot_eval.X_train,
    y_test=prot_eval.X_test,
    y_col=prot_eval.col_name)

cross_ds = dict(
    X_train=gene_cross.X_train,
    X_test=gene_cross.X_test,
Example #28
from sisua.analysis import Posterior
from sisua.data import OMIC, get_dataset, standardize_protein_name
from sisua.models import (SCVI, SISUA, DeepCountAutoencoder, NetworkConfig,
                          RandomVariable, VariationalAutoEncoder)

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

tf.random.set_seed(8)
np.random.seed(8)
# TODO: update this tutorial
# ===========================================================================
# Loading Data
# ===========================================================================
sco = get_dataset('8kly')
print(sco)
train, test = sco.split(train_percent=0.9)
n_genes = sco.numpy(OMIC.transcriptomic).shape[1]
n_prots = sco.numpy(OMIC.proteomic).shape[1]

gene_omic = RandomVariable(n_genes, posterior='zinb', name='rna')
prot_omic = RandomVariable(n_prots, posterior='nb', name='adt')
network = NetworkConfig(nlayers=1,
                        hidden_dim=64,
                        pyramid=True,
                        use_conv=False,
                        input_dropout=0.)
latent_dim = 12
epochs = 3
analytic = False
Example #29
from __future__ import print_function, division, absolute_import

from odin.stats import describe

from sisua.data import get_dataset
from sisua.label_threshold import ProbabilisticEmbedding

# ===========================================================================
# Load dataset
# ===========================================================================
FIGURE_PATH = '/tmp/tmp.pdf'

ds, _, _ = get_dataset('pbmc_citeseq')
protein = ds['y']
protein_name = ds['y_col']

print(protein.shape)
print(protein_name)

# ===========================================================================
# Probabilistic Embedding
# ===========================================================================
pb = ProbabilisticEmbedding(n_components_per_class=2,
                            positive_component=1,
                            log_norm=True,
                            clip_quartile=0.,
                            remove_zeros=True,
                            ci_threshold=-0.68,
                            random_state=5218,
                            verbose=True)
pb.fit(protein)