import pandas as pd

from mofapy2.run.entry_point import entry_point


def test_mofa():
    D = [1000, 1000]  # Number of features per view
    M = len(D)        # Number of views
    K = 5             # Number of factors
    N = [100, 100]    # Number of samples per group
    G = len(N)        # Number of groups

    data_dt = pd.read_csv(
        "http://ftp.ebi.ac.uk/pub/databases/mofa/getting_started/data.txt.gz",
        sep="\t")

    ent = entry_point()
    ent.set_data_options(scale_groups=False, scale_views=False)
    ent.set_data_df(data_dt, likelihoods=["gaussian", "gaussian"])
    ent.set_model_options(factors=10, spikeslab_weights=True, spikeslab_factors=True,
                          ard_factors=True, ard_weights=True)
    ent.set_train_options(iter=1000, convergence_mode="fast", startELBO=1, freqELBO=1,
                          dropR2=0.001, gpu_mode=True, verbose=False, seed=1)
    ent.build()
    ent.run()
import numpy as np

from mofapy2.run.entry_point import entry_point


def run_mofa_plus(Y):
    K = 10
    G = 1              # Number of groups
    M = 1              # Number of views
    D = [Y.shape[1]]   # Number of features per view
    N = [Y.shape[0]]   # Number of samples per group

    data_mat = [[None for g in range(G)] for m in range(M)]
    data_mat[0][0] = np.copy(Y)

    ent = entry_point()
    ent.set_data_options(scale_groups=False, use_float32=True, scale_views=False)
    ent.set_data_matrix(data_mat, likelihoods=["gaussian"])
    ent.set_model_options(factors=K * 2, spikeslab_weights=True, ard_factors=True,
                          ard_weights=True)
    ent.set_train_options(iter=300, convergence_mode="slow", startELBO=1, freqELBO=1,
                          dropR2=0.001, nostop=True, gpu_mode=True, startSparsity=1,
                          verbose=False, seed=1)
    ent.build()
    ent.run()

    # extract posterior expectations of the factors and weights
    learned_U = ent.model.nodes['Z'].getExpectations()['EN']
    learned_S_U = ent.model.nodes['Z'].getExpectations()['EB']
    learned_V = ent.model.nodes['W'].getExpectations()[0]['EN']
    learned_S_V = ent.model.nodes['W'].getExpectations()[0]['EB']
    theta_w = ent.model.nodes['ThetaW'].getExpectations()[0]['E']
    theta_z = ent.model.nodes['ThetaZ'].getExpectations()['E']

    fit = {
        'V': learned_V,
        'V_S': learned_S_V,
        'V_E': learned_V * learned_S_V,
        'U': learned_U,
        'U_S': learned_S_U,
        'U_E': learned_U * learned_S_U,
        'theta_V': theta_w,
        'theta_U': theta_z,
    }
    return fit
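# Usage sketch (hedged): `Y` below is a random stand-in for a real
# (samples x features) matrix. Up to K * 2 = 20 factors are returned;
# with dropR2=0.001 inactive factors may be pruned during training.
Y = np.random.normal(size=(100, 500))
fit = run_mofa_plus(Y)
print(fit['U_E'].shape)  # sample factors: E[Z] * E[S]
print(fit['V_E'].shape)  # feature weights: E[W] * E[S]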
def train_MOFA(input_data, times, group_names, feature_names, sample_names,
               view_names, outfile, use_GP=True, model_groups=True,
               center_groups=False):
    from mofapy2.run.entry_point import entry_point

    # prepare MEFISTO model
    ent = entry_point()
    ent.set_data_options(center_groups=center_groups)
    ent.set_data_matrix(input_data,
                        groups_names=group_names,
                        features_names=feature_names,
                        samples_names=sample_names,
                        views_names=view_names)
    ent.set_model_options(factors=2)
    ent.set_train_options(seed=2020)
    if use_GP:
        ent.set_covariates(times, covariates_names="month")
        ent.set_smooth_options(model_groups=model_groups, warping=False,
                               warping_ref=0, n_grid=10, opt_freq=50,
                               start_opt=50)  # opt_freq added for RCLR

    # Build and run the model
    ent.build()
    ent.run()

    # interpolate
    if use_GP:
        ent.predict_factor(new_covariates=ent.model.nodes["Sigma"].covariates)

    ent.save(outfile)
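# Usage sketch (hedged): shapes and names below are illustrative. input_data is
# a nested list indexed [view][group] of (samples x features) matrices; times
# holds one (samples x 1) covariate array per group, matching the convention
# used by set_covariates elsewhere in this codebase.
import numpy as np

n, d = 50, 200
toy_data = [[np.random.normal(size=(n, d))]]                  # 1 view, 1 group
toy_times = [np.arange(n, dtype=float).reshape(n, 1)]         # hypothetical time points
train_MOFA(toy_data, toy_times,
           group_names=["group1"],
           feature_names=[["feature%d" % i for i in range(d)]],
           sample_names=[["sample%d" % i for i in range(n)]],
           view_names=["view1"],
           outfile="mefisto_toy.hdf5")                        # hypothetical output path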
def test_build_basic(self):
    ent = entry_point()
    ent.set_data_options(scale_groups=False, scale_views=False)

    views_names = ["view1", "view2"]
    groups_names = ["groupA", "groupB"]

    # Set dimensions
    n_g1, n_g2 = 10, 20
    d_m1, d_m2 = 30, 40

    np.random.seed(42)
    ent.set_data_matrix([
        [np.random.random((n_g1, d_m1)), np.random.random((n_g2, d_m1))],
        [np.random.random((n_g1, d_m2)), np.random.random((n_g2, d_m2))],
    ])
    ent.set_model_options()
    ent.set_train_options()
    ent.build()
# datafile = "/hps/nobackup2/research/stegle/users/ricard/peer/data/FullFreeze_Corrected_iPSC_TPM2_20180626_hqS.txt.gz" datafile = "/g/stegle/ricard/peer/data/FullFreeze_Corrected_iPSC_TPM2_20180626_hqS.txt.gz" # The data has to be loaded as a pandas dataframe or as a numpy matrix with dimensions (samples,features) data = pd.read_csv(datafile, header=0, sep='\t', index_col=0) # Define likelihoods: non-gaussian likelihoods are implemented (poisson and bernoulli), but by default we use gaussian. lik = ["gaussian"] ########################### ## Initialise MOFA model ## ########################### # initialise the entry point ent = entry_point() # Set data options ent.set_data_options(likelihoods=lik) # Set data ent.set_data_matrix([[data]]) # do not modify this nested list # Set model options # - factors: number of factors # - spikeslab_weights: use spike-and-slab sparsity on the loading? # - ard_weights: use ARD prior on the loadings (please do not edit this) ent.set_model_options(factors=100, spikeslab_weights=False, ard_weights=False, likelihoods=lik)
def mofa(
    data: Union[AnnData, MuData],
    groups_label: Optional[str] = None,
    use_raw: bool = False,
    use_layer: Optional[str] = None,
    use_var: Optional[str] = "highly_variable",
    use_obs: Optional[str] = None,
    likelihoods: Optional[Union[str, List[str]]] = None,
    n_factors: int = 10,
    scale_views: bool = False,
    scale_groups: bool = False,
    center_groups: bool = True,
    ard_weights: bool = True,
    ard_factors: bool = True,
    spikeslab_weights: bool = True,
    spikeslab_factors: bool = False,
    n_iterations: int = 1000,
    convergence_mode: str = "fast",
    use_float32: bool = False,
    gpu_mode: bool = False,
    svi_mode: bool = False,
    svi_batch_size: float = 0.5,
    svi_learning_rate: float = 1.0,
    svi_forgetting_rate: float = 0.5,
    svi_start_stochastic: int = 1,
    smooth_covariate: Optional[str] = None,
    smooth_warping: bool = False,
    smooth_kwargs: Optional[Mapping[str, Any]] = None,
    save_parameters: bool = False,
    save_data: bool = True,
    save_metadata: bool = True,
    seed: int = 1,
    outfile: Optional[str] = None,
    expectations: Optional[List[str]] = None,
    save_interrupted: bool = True,
    verbose: bool = False,
    quiet: bool = True,
    copy: bool = False,
):
    """
    Run Multi-Omics Factor Analysis.

    PARAMETERS
    ----------
    data
        a MuData object
    groups_label : optional
        a column name in adata.obs for grouping the samples
    use_raw : optional
        use raw slot of AnnData as input values
    use_layer : optional
        use a specific layer of AnnData as input values (supersedes use_raw option)
    use_var : optional
        .var column with a boolean value to select genes (e.g. "highly_variable"), None by default
    use_obs : optional
        strategy to deal with samples (cells) not being the same across modalities
        ("union" or "intersection", throw error by default)
    likelihoods : optional
        likelihoods to use, default is guessed from the data
    n_factors : optional
        number of factors to train the model with
    scale_views : optional
        scale views to unit variance
    scale_groups : optional
        scale groups to unit variance
    center_groups : optional
        center groups to zero mean (True by default)
    ard_weights : optional
        use view-wise sparsity
    ard_factors : optional
        use group-wise sparsity
    spikeslab_weights : optional
        use feature-wise sparsity (e.g. gene-wise)
    spikeslab_factors : optional
        use sample-wise sparsity (e.g. cell-wise)
    n_iterations : optional
        upper limit on the number of iterations
    convergence_mode : optional
        fast, medium, or slow convergence mode
    use_float32 : optional
        use reduced precision (float32)
    gpu_mode : optional
        if to use GPU mode
    svi_mode : optional
        if to use Stochastic Variational Inference (SVI)
    svi_batch_size : optional
        batch size as a fraction (only applicable when svi_mode=True, 0.5 by default)
    svi_learning_rate : optional
        learning rate (only applicable when svi_mode=True, 1.0 by default)
    svi_forgetting_rate : optional
        forgetting rate (only applicable when svi_mode=True, 0.5 by default)
    svi_start_stochastic : optional
        first iteration to start SVI (only applicable when svi_mode=True, 1 by default)
    smooth_covariate : optional
        use a covariate (column in .obs) to learn smooth factors (MEFISTO)
    smooth_warping : optional
        if to learn the alignment of covariates (e.g. time points) from different groups;
        by default, the first group is used as a reference, which can be adjusted by
        setting smooth_kwargs = { "warping_ref": REF_GROUP } (MEFISTO)
    smooth_kwargs : optional
        additional arguments for MEFISTO (covariates_names, scale_cov, start_opt, n_grid,
        opt_freq, warping_freq, warping_ref, warping_open_begin, warping_open_end,
        sparseGP, frac_inducing, model_groups, new_values)
    save_parameters : optional
        if to save training parameters
    save_data : optional
        if to save training data
    save_metadata : optional
        if to load metadata from the AnnData object (.obs and .var tables) and save it,
        False by default
    seed : optional
        random seed
    outfile : optional
        path to HDF5 file to store the model
    expectations : optional
        which nodes should be used to save expectations for (will save only W and Z by default);
        possible expectations names include Y, W, Z, Tau, AlphaZ, AlphaW, ThetaW, ThetaZ
    save_interrupted : optional
        if to save partially trained model when the training is interrupted
    verbose : optional
        print verbose information during training
    quiet : optional
        silence messages during training procedure
    copy : optional
        return a copy of AnnData instead of writing to the provided object
    """
    try:
        from mofapy2.run.entry_point import entry_point
    except ImportError:
        raise ImportError(
            "MOFA+ is not available. Install MOFA+ from PyPI (`pip install mofapy2`) or from GitHub (`pip install git+https://github.com/bioFAM/MOFA2`)"
        )

    if isinstance(data, AnnData):
        logging.info("Wrapping an AnnData object into a MuData container")
        mdata = MuData(data)
        # Modality name is used as a prefix by default
        mdata.obs = data.obs
    elif isinstance(data, MuData):
        mdata = data
    else:
        raise TypeError("Expected a MuData object")

    if outfile is None:
        outfile = os.path.join("/tmp", "mofa_{}.hdf5".format(strftime("%Y%m%d-%H%M%S")))

    if use_var:
        if use_var not in data.var.columns:
            warn(f"There is no column {use_var} in the provided object")
            use_var = None

    if isinstance(data, MuData):
        common_obs = reduce(np.intersect1d, [v.obs_names.values for k, v in mdata.mod.items()])
        if len(common_obs) != mdata.n_obs:
            if not use_obs:
                raise IndexError(
                    "Not all the observations are the same across modalities. Please run `mdata.intersect_obs()` to subset the data or devise a strategy with `use_obs` ('union' or 'intersection')"
                )
            elif use_obs not in ["union", "intersection"]:
                raise ValueError(
                    f"Expected `use_obs` argument to be 'union' or 'intersection', not '{use_obs}'"
                )
    else:
        use_obs = None

    ent = entry_point()

    lik = likelihoods
    if lik is not None:
        if isinstance(lik, str):
            # a single likelihood is replicated across all modalities
            lik = [lik for _ in range(len(mdata.mod))]

    ent.set_data_options(
        scale_views=scale_views,
        scale_groups=scale_groups,
        center_groups=center_groups,
        use_float32=use_float32,
    )

    logging.info(
        f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Setting data from MuData object..."
    )
    _set_mofa_data_from_mudata(
        model=ent,
        mdata=mdata,
        groups_label=groups_label,
        use_raw=use_raw,
        use_layer=use_layer,
        likelihoods=lik,
        features_subset=use_var,
        save_metadata=save_metadata,
        use_obs=use_obs,
    )

    logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Setting model options...")
    ent.set_model_options(
        ard_factors=ard_factors,
        ard_weights=ard_weights,
        spikeslab_weights=spikeslab_weights,
        spikeslab_factors=spikeslab_factors,
        factors=n_factors,
    )

    logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Setting training options...")
    ent.set_train_options(
        iter=n_iterations,
        convergence_mode=convergence_mode,
        gpu_mode=gpu_mode,
        seed=seed,
        verbose=verbose,
        quiet=quiet,
        outfile=outfile,
        save_interrupted=save_interrupted,
    )

    if svi_mode:
        logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Setting up SVI...")
        ent.set_stochastic_options(
            learning_rate=svi_learning_rate,
            forgetting_rate=svi_forgetting_rate,
            batch_size=svi_batch_size,
            start_stochastic=svi_start_stochastic,
        )

    # MEFISTO options
    smooth_kwargs_default = dict(
        covariates_names=smooth_covariate,
        scale_cov=False,
        start_opt=20,
        n_grid=20,
        opt_freq=10,
        model_groups=True,
        warping_freq=20,
        warping_ref=0,
        warping_open_begin=True,
        warping_open_end=True,
        sparseGP=False,
        frac_inducing=None,
        new_values=None,
    )

    if not smooth_kwargs:
        smooth_kwargs = {}

    # warping_ref has to be an integer
    if "warping_ref" in smooth_kwargs:
        warping_ref = smooth_kwargs["warping_ref"]
        if not isinstance(warping_ref, int):
            warping_ref = np.where(np.array(ent.data_opts["groups_names"]) == warping_ref)[0]
            if len(warping_ref) == 0:
                raise KeyError(
                    f"Expected 'warping_ref' to be a group name but there is no group {warping_ref}"
                )
            smooth_kwargs["warping_ref"] = warping_ref[0]

    # Add default options where they are not provided
    smooth_kwargs = {**smooth_kwargs_default, **smooth_kwargs}

    if smooth_covariate is not None:
        logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Adding smooth options...")
        ent.set_covariates(smooth_covariate, covariates_names=smooth_kwargs["covariates_names"])
        ent.set_smooth_options(
            scale_cov=smooth_kwargs["scale_cov"],
            start_opt=smooth_kwargs["start_opt"],
            n_grid=smooth_kwargs["n_grid"],
            opt_freq=smooth_kwargs["opt_freq"],
            model_groups=smooth_kwargs["model_groups"],
            warping=smooth_warping,
            warping_freq=smooth_kwargs["warping_freq"],
            warping_ref=smooth_kwargs["warping_ref"],
            warping_open_begin=smooth_kwargs["warping_open_begin"],
            warping_open_end=smooth_kwargs["warping_open_end"],
            sparseGP=smooth_kwargs["sparseGP"],
            frac_inducing=smooth_kwargs["frac_inducing"],
        )

    logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Building the model...")
    ent.build()

    logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Running the model...")
    ent.run()

    if (
        smooth_kwargs is not None
        and "new_values" in smooth_kwargs
        and smooth_kwargs["new_values"]
        and smooth_covariate
    ):
        logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Interpolating factors...")
        new_values = np.array(smooth_kwargs["new_values"])
        if new_values.ndim == 1:
            new_values = new_values.reshape(-1, 1)
        ent.predict_factor(new_covariates=new_values)

    logging.info(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] Saving the model...")
    ent.save(
        outfile, save_data=save_data, save_parameters=save_parameters, expectations=expectations
    )

    f = h5py.File(outfile, "r")

    if copy:
        data = data.copy()

    # Factors
    z = np.concatenate([v[:, :] for k, v in f["expectations"]["Z"].items()], axis=1).T

    # Samples are grouped per sample group,
    # so the rows of the Z matrix have to be re-ordered
    if groups_label:
        zs = np.concatenate([v[:] for k, v in f["samples"].items()], axis=0).astype(str)

    if use_obs and use_obs == "intersection":
        # data is MuData and common_obs is available
        if groups_label:
            z = pd.DataFrame(z, index=zs).loc[common_obs].to_numpy()
        # Set factor values outside of the obs intersection to NaN
        data.obsm["X_mofa"] = np.empty(shape=(data.n_obs, z.shape[1]))
        data.obsm["X_mofa"][:] = np.nan
        data.obsm["X_mofa"][data.obs.index.isin(common_obs)] = z
    else:
        if groups_label:
            z = pd.DataFrame(z, index=zs).loc[mdata.obs.index.values].to_numpy()
        data.obsm["X_mofa"] = z

    # Weights
    w = np.concatenate([v[:, :] for k, v in f["expectations"]["W"].items()], axis=1).T
    if use_var:
        # Set the weights of features that were not used to zero
        data.varm["LFs"] = np.zeros(shape=(data.n_vars, w.shape[1]))
        data.varm["LFs"][data.var[use_var]] = w
    else:
        data.varm["LFs"] = w

    # Aligned times
    if smooth_covariate is not None and smooth_warping:
        for c in range(ent.dimensionalities["C"]):
            cnm = ent.smooth_opts["covariates_names"][c] + "_warped"
            cval = ent.model.getNodes()["Sigma"].sample_cov_transformed[:, c]
            if groups_label:
                cval = pd.DataFrame(cval, index=zs).loc[common_obs].to_numpy()
            data.obs[cnm] = cval

    # Parameters
    data.uns["mofa"] = {
        "params": {
            "data": {
                "groups_label": groups_label,
                "use_raw": use_raw,
                "use_layer": use_layer,
                "likelihoods": f["model_options"]["likelihoods"][:].astype(str),
                "features_subset": use_var,
                "use_obs": use_obs,
                "scale_views": scale_views,
                "scale_groups": scale_groups,
                "center_groups": center_groups,
                "use_float32": use_float32,
            },
            "model": {
                "ard_factors": ard_factors,
                "ard_weights": ard_weights,
                "spikeslab_weights": spikeslab_weights,
                "spikeslab_factors": spikeslab_factors,
                "n_factors": n_factors,
            },
            "training": {
                "n_iterations": n_iterations,
                "convergence_mode": convergence_mode,
                "gpu_mode": gpu_mode,
                "seed": seed,
            },
        }
    }

    # Variance explained
    try:
        views = f["views"]["views"][:].astype(str)
        variance_per_group = f["variance_explained"]["r2_per_factor"]
        variance = {m: {} for m in views}
        groups = f["groups"]["groups"][:].astype(str)
        if len(groups) > 1:
            for group in list(variance_per_group.keys()):
                for i, view in enumerate(views):
                    variance[view][group] = variance_per_group[group][i, :]
        else:
            for i, view in enumerate(views):
                variance[view] = variance_per_group[groups[0]][i, :]
        data.uns["mofa"]["variance"] = variance
    except Exception:
        warn("Cannot save variance estimates")

    f.close()

    if copy:
        return data
    else:
        print("Saved MOFA embeddings in .obsm['X_mofa'] slot and their loadings in .varm['LFs'].")
        return None
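# Usage sketch (hedged): a toy MuData object with two random modalities; the
# modality names, shapes, and output path are illustrative only.
import numpy as np
from anndata import AnnData
from mudata import MuData

np.random.seed(1)
mdata = MuData({
    "mod1": AnnData(np.random.normal(size=(50, 100))),
    "mod2": AnnData(np.random.normal(size=(50, 80))),
})
mofa(mdata, use_var=None, n_factors=5, n_iterations=100, outfile="toy_mofa.hdf5")
print(mdata.obsm["X_mofa"].shape)  # (50, 5): per-cell factor values
print(mdata.varm["LFs"].shape)     # (180, 5): per-feature loadings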
import copy
import os
import time
import tracemalloc
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as ss

from mofapy2.run.entry_point import entry_point
# assuming the simulation utilities shipped with mofapy2
from mofapy2.simulate import simulate_mofa as simmofa


def run_grid(nfactors=3, G=5, N=20, Dm=500, noise_level=1, missing=0.1,
             missing_all=0.1, seed=1234567, method="MEFISTO", note="none",
             lscales=[0.2, 0.1, 0.0], scales=[1, 0.6, 0], M=4, plot=False,
             max_iter=1000, verbose=False, sparse_frac=0.75, warp=False,
             save=False, group_differences=True, model_groups=True):
    nfactors = int(nfactors)
    assert len(lscales) == nfactors
    assert len(scales) == nfactors
    groupsidx = np.repeat(range(G), N)

    # simulate data
    np.random.seed(seed)
    if group_differences:
        if nfactors == 3:
            # one shared, one non-shared, one non-smooth factor
            sharedness = np.random.choice([True, False], 2, replace=False).tolist() + [False]
        else:
            sharedness = True  # not in use
    else:
        sharedness = True

    sim = simmofa.simulate_data(N=N, seed=seed, views=["0", "1", "2", "3"],
                                D=[Dm] * M, K=nfactors, G=G, lscales=lscales,
                                noise_level=noise_level, scales=scales,
                                shared=sharedness)

    # mask parts of the data
    data_full = copy.deepcopy(sim['data'])
    sim['data'] = simmofa.mask_samples(sim, perc=missing, perc_all_views=missing_all)

    # misalign covariates between groups
    if warp:
        assert G == 3, "Warping defined only for G=3"
        sim['sample_cov'][1] = np.exp(sim['sample_cov'][1])
        sim['sample_cov'][2] = 0.4 * sim['sample_cov'][2] + 0.3

    # optional plotting of simulated factors
    if plot:
        fig, axs = plt.subplots(1, nfactors)
        Zsim = sim['Z']
        for g in range(G):
            for i in range(nfactors):
                axs[i].scatter(sim['sample_cov'][g], Zsim[g][:, i])
                axs[i].set_title("simulated factors")

    # prepare model
    ent = entry_point()
    ent.set_data_options(scale_views=False)
    ent.set_data_matrix(sim['data'])
    ent.set_model_options(factors=nfactors)
    ent.set_train_options(seed=2020, convergence_mode="fast", iter=max_iter, verbose=verbose)

    # for time-aware multi-modal FA with GP model, add covariates
    if not method == "MOFA2":
        ent.set_covariates(sim['sample_cov'])
        if method == "MEFISTO+align":
            ent.set_smooth_options(warping=True, model_groups=model_groups)
        elif method == "MEFISTO_sparse":
            ent.set_smooth_options(model_groups=model_groups, sparseGP=True,
                                   n_inducing=int((N * G) * sparse_frac))
        else:
            ent.set_smooth_options(model_groups=model_groups)

    # build and run the model
    tracemalloc.start()
    ent.build()
    t0 = time.time()
    ent.run()
    t1 = time.time()
    total = t1 - t0
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    # get inferred hyperparameters
    if method != "MOFA2":
        scales_learnt = ent.model.train_stats['scales']
        lscales_learnt = ent.model.train_stats['length_scales']
    else:
        scales_learnt = np.array([np.nan] * nfactors)
        lscales_learnt = np.array([np.nan] * nfactors)

    # calculate factor recovery error;
    # if the wrong number of factors was inferred, set everything to NaN
    Zlearnt = ent.model.getExpectations()['Z']['E']
    if not Zlearnt.shape[1] == nfactors:
        factor_r2 = np.nan
        scales_learnt = [np.nan] * nfactors
        lscales_learnt = [np.nan] * nfactors
        post_var = [np.nan] * nfactors
        factor_idx = [np.nan] * nfactors
    else:
        Zsim = np.vstack(sim['Z'])
        # get idx of the learnt factor corresponding to each simulated factor by maximal correlation
        factor_idx = [np.argmax([abs(ss.pearsonr(Zsim[:, p], Zlearnt[:, pp])[0])
                                 for pp in range(nfactors)])
                      for p in range(nfactors)]
        # check for duplicates - if present, not all factors are captured by a unique learnt factor
        if not len(factor_idx) == len(set(factor_idx)):
            factor_r2 = np.nan
        else:
            # calculate correlation between inferred and simulated factors
            factor_r2 = np.mean([np.max([abs(ss.pearsonr(Zsim[:, pp], Zlearnt[:, p])[0])
                                         for pp in range(nfactors)])
                                 for p in range(nfactors)]) ** 2
        if method != "MOFA2":
            scales_learnt = scales_learnt[factor_idx]  # match to simulated factors
            lscales_learnt = lscales_learnt[factor_idx]
            if verbose:
                print(scales_learnt)
        # get posterior variance
        post_var = ent.model.getExpectations()['Z']['E2'] - (ent.model.getExpectations()['Z']['E']) ** 2
        post_var = post_var.mean(axis=0)

    # get imputation error
    ent.impute(mask_outliers=False)
    mse = 0
    n_missing = 0
    imp_r2 = 1
    if missing + missing_all > 0:
        for m in range(M):
            mask_m = ent.model.nodes['Y'].getNodes()[m].getMask()
            mse_m = ((ent.imputed_data["mean"][m][mask_m] -
                      np.vstack(data_full[m])[mask_m]) ** 2).sum()
            mse = mse + mse_m
            n_missing = n_missing + mask_m.sum()
        mse = mse / n_missing
        imp_r2 = np.mean([ss.pearsonr(
            np.vstack(data_full[m])[ent.model.nodes['Y'].getNodes()[m].getMask()].flatten(),
            ent.imputed_data["mean"][m][ent.model.nodes['Y'].getNodes()[m].getMask()].flatten())[0] ** 2
            for m in range(M)])
    rec_r2 = np.mean([ss.pearsonr(np.vstack(data_full[m]).flatten(),
                                  ent.imputed_data["mean"][m].flatten())[0] ** 2
                      for m in range(M)])

    # get warping error
    if method == "MEFISTO+align":
        sample_cov_transformed = ent.model.getNodes()['Sigma'].sample_cov_transformed
        # compare to the untransformed reference group
        warp_mse = sum([sum((sample_cov_transformed[groupsidx == g] - sim['sample_cov'][0]) ** 2)
                        for g in range(G)]) / (N * G)
    else:
        # no transformation made
        warp_mse = sum([sum((sim['sample_cov'][g] - sim['sample_cov'][0]) ** 2)
                        for g in range(G)]) / (N * G)
    warp_mse = warp_mse[0]

    # get group covariance error
    if group_differences and len(factor_idx) == len(set(factor_idx)):
        if "Sigma" in ent.model.nodes.keys() and model_groups:
            Gmat_learnt = ent.model.nodes['Sigma'].getParameters()['Kg']
        # MEFISTO without model_groups assumes all groups to be connected
        elif "Sigma" in ent.model.nodes.keys():
            Gmat_learnt = [np.ones([G, G])] * nfactors
        # MOFA2 assumes all groups to be unconnected
        else:
            Gmat_learnt = [np.eye(G)] * nfactors
        # get sharedness error
        true_sharedness = [np.mean(np.abs(sim['Gmats'][k] - np.eye(G))[np.triu_indices(G, 1)])
                           for k in range(nfactors)]
        inferred_sharedness = [np.mean(np.abs(Gmat_learnt[factor_idx[k]] - np.eye(G))[np.triu_indices(G, 1)])
                               for k in range(nfactors)]
    # if no group covariance was simulated, set to NaN
    else:
        true_sharedness = [np.nan] * nfactors
        inferred_sharedness = [np.nan] * nfactors

    # write output to csv
    results = {'factor_r2': factor_r2, 'time': total, 'method': method,
               'model_groups': model_groups, 'group_differences': group_differences,
               'N': N, 'G': G, 'Dm': Dm, 'noise_level': noise_level,
               'missing': missing + missing_all, 'seed': seed, 'date': date.today(),
               'note': note, 'mem_usage': peak, 'lscales': lscales, 'scales': scales,
               'sparse_frac': sparse_frac, 'n_factors': nfactors, 'warp_mse': warp_mse,
               'n_factors_learnt': Zlearnt.shape[1], 'scales_learnt': scales_learnt,
               'lscales_learnt': lscales_learnt,
               'true_sharedness': np.array(true_sharedness),
               'inferred_sharedness': np.array(inferred_sharedness),
               'post_var': post_var, 'mse': mse, 'imp_r2': imp_r2, 'rec_r2': rec_r2}
    if verbose:
        print(results)

    df = pd.DataFrame.from_dict(data=results, orient='index').T

    # expand multi-factor columns
    for nm in ['scales', 'lscales', 'scales_learnt', 'lscales_learnt',
               'true_sharedness', 'inferred_sharedness', 'post_var']:
        dfsplit = df[nm].apply(pd.Series)
        dfsplit = dfsplit.rename(columns=lambda x: nm + "_" + str(x))
        df = pd.concat([df, dfsplit], axis=1)
        df = df.drop(columns=[nm])

    # optional plotting of inferred factors
    if plot:
        Zlearnt = ent.model.getExpectations()['Z']['E']
        fig, axs = plt.subplots(1, nfactors)
        for g in range(G):
            for i in range(nfactors):
                axs[i].scatter(sim['sample_cov'][g], Zlearnt[groupsidx == g, i])
                axs[i].set_title("inferred factors")

    # save summary statistics if the model itself is not saved
    if not save:
        if os.path.exists('out/simulation_results.csv'):
            df.to_csv('out/simulation_results.csv', mode='a', header=False)
        else:
            df.to_csv('out/simulation_results.csv', header=True)
    else:
        ent.save("out/grid_model.hdf5")
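# Usage sketch (hedged): a single small grid point; summary statistics are
# appended to out/simulation_results.csv, so the output directory must exist.
os.makedirs("out", exist_ok=True)
run_grid(nfactors=3, G=3, N=10, Dm=100, method="MEFISTO",
         lscales=[0.2, 0.1, 0.0], scales=[1, 0.6, 0], max_iter=100)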
args = parser.parse_args()

if isinstance(args.logfile, str):
    logging.basicConfig(level=logging.INFO, filename=args.logfile)
else:
    logging.basicConfig(level=logging.INFO)

logging.info("Setting up numpy multi-threading. Using {} threads".format(args.use_threads))
os.environ['OPENBLAS_NUM_THREADS'] = str(args.use_threads)

logging.info("Reading in ADT counts matrices")

# initialise the model entry point
mofa_ent = entry_point()

# each matrix should be a gzipped file
file_list = args.counts_files.split(",")

# read in the donor data frames as a dict for reference;
# there must be the same number of files unless running in joint mode
logging.info("Reading in donor information: {}".format(args.donor_files))
donor_list = args.donor_files.split(",")
if len(donor_list) != len(file_list):
    if args.omic == 'joint' and len(donor_list) == 1:
        donor_names = ["Gene", "ADT"]
        donor_df_list = [
            pd.read_table(D, sep="\t", header=0, index_col='CellID')
            for D in donor_list
        ]
def __init__(
    self,
    views=None,
    groupby=None,
    likelihoods=None,
    factors_n=10,
    covariates=None,
    fit_intercept=True,
    scale_views=True,
    scale_groups=True,
    iterations=1000,
    convergence_mode="slow",
    use_overlap=True,
    startELBO=1,
    freqELBO=1,
    dropR2=None,
    verbose=1,
    from_file=None,
):
    """
    This is a wrapper of MOFA to perform multi-omics integration of the GDSC data-sets.

    - Multiple groups are NOT supported
    - Only samples present in all views are considered

    :param views: dict(str: pandas.DataFrame)
    """
    self.verbose = verbose
    self.from_file = from_file

    self.factors_n = factors_n
    self.factors_labels = [f"F{i + 1}" for i in range(factors_n)]

    self.likelihoods = likelihoods

    self.views = views
    self.scale_views = scale_views
    self.scale_groups = scale_groups
    self.views_labels = list(self.views)
    self.use_overlap = use_overlap

    self.iterations = iterations
    self.convergence_mode = convergence_mode
    self.startELBO = startELBO
    self.freqELBO = freqELBO
    self.dropR2 = dropR2

    # Covariates
    self.covariates = covariates
    self.fit_intercept = fit_intercept

    # Samples
    self.samples = set.intersection(*[set(self.views[v]) for v in self.views_labels])
    if self.covariates is not None:
        LOG.info(f"Covariates provided N={len(self.covariates)}")
        for k, v in self.covariates.items():
            self.samples = self.samples.intersection(set(v.index))
    self.samples = list(self.samples)
    LOG.info(f"Overlapping samples: {len(self.samples)}")

    # Reduce to overlapping samples
    if self.use_overlap:
        for k, df in self.views.items():
            self.views[k] = df[self.samples]

    # Info
    for k, df in self.views.items():
        LOG.info(f"View {k}: {df.shape}")

    # Regress-out covariates
    if self.covariates is not None:
        self.views = self.regress_out_covariates(fit_intercept=self.fit_intercept)

    # RUN MOFA
    # Prepare data: melt & tidy
    self.data = []
    for k in self.views_labels:
        df = self.views[k].copy()
        df.index.name = "feature"
        df.columns.name = "sample"
        df = df.unstack().rename("value").reset_index()
        if groupby is not None:
            df["group"] = groupby.reindex(df["sample"]).values
        else:
            df = df.assign(group="gdsc")
        self.data.append(df.assign(view=k))
    self.data = pd.concat(self.data, ignore_index=True)
    self.data = self.data[["sample", "group", "feature", "value", "view"]].dropna()

    # Initialise entry point
    self.ep = entry_point()

    # Set data options
    self.ep.set_data_options(scale_groups=self.scale_groups, scale_views=self.scale_views)
    self.ep.set_data_df(self.data, likelihoods=self.likelihoods)

    # Set model options
    self.ep.set_model_options(factors=self.factors_n)

    # Set training options
    self.ep.set_train_options(
        iter=self.iterations,
        convergence_mode=self.convergence_mode,
        startELBO=self.startELBO,
        freqELBO=self.freqELBO,
        dropR2=self.dropR2,
        verbose=verbose > 0,
    )

    # Run MOFA
    self.ep.build()
    if not os.path.isfile(self.from_file):
        self.ep.run()
        self.save_hdf5(self.from_file)

    self.mofa_file = h5py.File(self.from_file, "r")
    self.factors = self.get_factors(self.mofa_file)
    self.weights = self.get_weights(self.mofa_file)
    self.rsquare = self.get_rsquare(self.mofa_file)
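# Usage sketch (hedged): only __init__ is shown above, so the enclosing class is
# referred to as `MOFA` here, which is a hypothetical name; views map view labels
# to (features x samples) DataFrames sharing sample columns, and the model file
# path is illustrative.
import numpy as np
import pandas as pd

samples = [f"sample{i}" for i in range(30)]
views = {
    "proteomics": pd.DataFrame(np.random.normal(size=(100, 30)), columns=samples),
    "transcriptomics": pd.DataFrame(np.random.normal(size=(200, 30)), columns=samples),
}
model = MOFA(views=views, factors_n=5, from_file="gdsc_mofa.hdf5")  # hypothetical class name and path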
import os
import sys
import time
import tracemalloc

import numpy as np
import pandas as pd

from mofapy2.run.entry_point import entry_point


def run_evodevo(nfactors=5, Ndown=3, warp=False, save=True, warping_ref="Mouse",
                sample_seed=4891, seed=2020,
                species=["Mouse", "Rabbit", "Rat", "Human", "Opossum"],
                views=["Brain", "Cerebellum", "Heart", "Liver", "Testis"],
                model_groups=True, nm=None, tissue_as_sample=False):
    if tissue_as_sample:
        assert not warp, "Need to adapt warping reference if tissues are treated as groups"

    # specify data directory of normalized gene expression data
    if species == ["Mouse", "Rabbit", "Rat"] and not warp:
        nmtmp = "MRRab"
        datadir = "data/input_data/MRRab_matched/"
    elif warp:
        nmtmp = "warping"
        datadir = "data/input_data/all_unmatched/"
    else:
        print("Matched inputs are only provided for [Mouse, Rabbit, Rat]")
        sys.exit()

    # set filename stem for output
    if nm is None:
        nm = nmtmp

    # load data and covariates
    data = []
    times = []
    samples_names = []
    if tissue_as_sample:
        group_names = []
        data_view = []
        for m in views:
            for g in species:
                df = pd.read_csv(datadir + "view_" + m + "_group_" + g + ".csv",
                                 header=0, index_col=0)
                data_view.append(np.asarray(df).transpose())
                times.append(np.asarray(pd.read_csv(datadir + "times_group_" + g + ".csv",
                                                    header=0, index_col=0)).transpose())
                samples_names.append(df.columns)
                group_names.append(m + "-" + g)
        data = [data_view]
        features_names = [df.index]
    else:
        for m in views:
            data_view = []
            for g in species:
                data_view.append(np.asarray(pd.read_csv(datadir + "view_" + m + "_group_" + g + ".csv",
                                                        header=0, index_col=0)).transpose())
                if m == "Brain":  # only needed once
                    times.append(np.asarray(pd.read_csv(datadir + "times_group_" + g + ".csv",
                                                        header=0, index_col=0)).transpose())
            data.append(data_view)

    # convert warping reference to a numeric index
    warping_ref = np.where([species[i] == warping_ref for i in range(len(species))])[0][0]

    # mask values at random
    if Ndown > 0:
        np.random.seed(sample_seed)
        if tissue_as_sample:
            for i in range(len(data[0])):
                Ng = data[0][i].shape[0]
                masked_samples = np.random.choice(Ng, Ndown, replace=False)
                data[0][i][masked_samples, :] = np.nan
        else:
            for m in range(len(views)):
                for g in range(len(species)):
                    Ng = data[m][g].shape[0]
                    masked_samples = np.random.choice(Ng, Ndown, replace=False)
                    data[m][g][masked_samples, :] = np.nan

    # check dimensions and name views and groups
    if tissue_as_sample:
        assert len(data) == 1, "problem in loading data, wrong number of views"
        assert len(data[0]) == len(species) * len(views), \
            "problem in loading data, wrong number of groups"
        view_names = ["mRNA"]
    else:
        assert len(data) == len(views), "problem in loading data, wrong number of views"
        assert len(data[0]) == len(species), "problem in loading data, wrong number of groups"
        view_names = views
        group_names = species

    # prepare MOFA model with time as covariate
    ent = entry_point()
    ent.set_data_options()
    ent.set_data_matrix(data, groups_names=group_names, views_names=view_names)
    ent.set_model_options(factors=nfactors)
    ent.set_train_options(seed=seed, convergence_mode="medium")
    ent.set_covariates(times, covariates_names="time")
    ent.set_smooth_options(warping=warp, warping_ref=warping_ref, model_groups=model_groups)

    # Build and run the model
    tracemalloc.start()
    ent.build()
    t0 = time.time()
    ent.run()
    t1 = time.time()
    total = t1 - t0
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()

    # save model
    if save:
        if Ndown == 0:
            if model_groups:
                outfile = "out/evodevo_groups_%s-seed_%s.hdf5" % (nm, seed)
            else:
                outfile = "out/evodevo_%s-seed_%s.hdf5" % (nm, seed)
            # interpolate for missing time points
            ent.predict_factor(new_covariates=ent.model.nodes["Sigma"].covariates)
        else:
            if model_groups:
                outfile = "out/evodevo_groups_%s-N%s-sample_seed_%s.hdf5" % (nm, Ndown, sample_seed)
            else:
                outfile = "out/evodevo_%s-N%s-sample_seed_%s.hdf5" % (nm, Ndown, sample_seed)
        ent.save(outfile)

    # write output to csv
    results = {'time': total, 'mem_usage': peak, 'n_down': Ndown,
               'sample_seed': sample_seed, 'seed': seed}
    df = pd.DataFrame.from_dict(data=results, orient='index').T

    if model_groups:
        stats_file = 'out/evodevo_groups_%s_stats.csv' % nm
    else:
        stats_file = 'out/evodevo_%s_stats.csv' % nm

    if os.path.exists(stats_file):
        df.to_csv(stats_file, mode='a', header=False)
    else:
        df.to_csv(stats_file, header=True)
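# Usage sketch (hedged): assumes the normalized expression matrices exist under
# data/input_data/MRRab_matched/ as view_<tissue>_group_<species>.csv plus one
# times_group_<species>.csv per species, and that an out/ directory exists.
run_evodevo(nfactors=5, Ndown=0, warp=False, save=True,
            species=["Mouse", "Rabbit", "Rat"],
            views=["Brain", "Cerebellum", "Heart", "Liver", "Testis"],
            model_groups=True)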