def regress(self):
    """
    Execute StARS for every response gene.

    :return: list
        A list of per-gene regression result dicts that base_regression's
        pileup_data can process
    """

    # Hand the whole job to the dask implementation when a cluster is up
    if MPControl.is_dask():
        from inferelator.distributed.dask_functions import lasso_stars_regress_dask
        return lasso_stars_regress_dask(
            self.X, self.Y, self.alphas, self.num_subsamples,
            self.random_seed, self.method, self.params,
            self.G, self.genes
        )

    def regression_maker(j):
        # Announce progress; full verbosity only every 100th gene
        verbosity = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(
            base_regression.PROGRESS_STR.format(gn=self.genes[j], i=j, total=self.G),
            level=verbosity
        )

        # Scale the response vector for gene j before model selection
        response = utils.scale_vector(
            self.Y.get_gene_data(j, force_dense=True, flatten=True)
        )

        result = stars_model_select(
            self.X.values,
            response,
            self.alphas,
            method=self.method,
            num_subsamples=self.num_subsamples,
            random_seed=self.random_seed,
            **self.params
        )
        result['ind'] = j
        return result

    return MPControl.map(regression_maker, range(self.G), tell_children=False)
def regress(self):
    """
    Execute Elastic Net for every response gene.

    :return: list
        A list of per-gene regression result dicts that base_regression's
        pileup_data can process
    """

    # Prefer the dask implementation when a cluster is available
    if MPControl.is_dask():
        from inferelator.distributed.dask_functions import sklearn_regress_dask
        return sklearn_regress_dask(
            self.X, self.Y, self.model, self.G, self.genes, self.min_coef
        )

    def regression_maker(j):
        # Announce progress; full verbosity only every 100th gene
        verbosity = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(
            base_regression.PROGRESS_STR.format(gn=self.genes[j], i=j, total=self.G),
            level=verbosity
        )

        # Copy the model so each gene fits an independent estimator
        result = sklearn_gene(
            self.X.values,
            utils.scale_vector(self.Y.get_gene_data(j, force_dense=True, flatten=True)),
            copy.copy(self.model),
            min_coef=self.min_coef
        )
        result['ind'] = j
        return result

    return MPControl.map(regression_maker, range(self.G), tell_children=False)
def mutual_information(x, y, bins, logtype=DEFAULT_LOG_TYPE):
    """
    Calculate the mutual information matrix between two data matrices,
    where the columns are equivalent conditions

    :param x: np.array (n x m1)
        The data from m1 variables across n conditions
    :param y: np.array (n x m2)
        The data from m2 variables across n conditions
    :param bins: int
        Number of bins to discretize continuous data into for the
        generation of a contingency table
    :param logtype: np.log func
        Which type of log function should be used (log2 results in MI bits,
        log results in MI nats, log10... is weird)
    :return mi: pd.DataFrame (m1 x m2)
        The mutual information between variables m1 and m2
    """

    # Densify y if it arrived sparse, then discretize it into bins
    if sps.isspmatrix(y):
        y = y.A
    y = _make_array_discrete(y, bins, axis=0)

    # Build the MI matrix, with dask when a cluster is available
    if MPControl.is_dask():
        from inferelator.distributed.dask_functions import build_mi_array_dask
        return build_mi_array_dask(x, y, bins, logtype=logtype)

    return build_mi_array(x, y, bins, logtype=logtype)
def regress(self):
    """
    Execute BBSR for every response gene.

    :return: pd.DataFrame [G x K], pd.DataFrame [G x K]
        Returns the regression betas and beta error reductions for all
        threads if this is the master thread (rank 0);
        Returns None, None if it's a subordinate thread
    """

    # Prefer the dask implementation when a cluster is available
    if MPControl.is_dask():
        from inferelator.distributed.dask_functions import bbsr_regress_dask
        return bbsr_regress_dask(
            self.X, self.Y, self.pp, self.weights_mat,
            self.G, self.genes, self.nS
        )

    def regression_maker(j):
        # Announce progress; full verbosity only every 100th gene
        verbosity = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(
            base_regression.PROGRESS_STR.format(gn=self.genes[j], i=j, total=self.G),
            level=verbosity
        )

        # Scale the response for gene j, then run BBSR against its
        # predictor-prior and weight rows
        scaled_response = utils.scale_vector(
            self.Y.get_gene_data(j, force_dense=True, flatten=True)
        )

        result = bayes_stats.bbsr(
            self.X.values,
            scaled_response,
            self.pp.iloc[j, :].values.flatten(),
            self.weights_mat.iloc[j, :].values.flatten(),
            self.nS,
            ordinary_least_squares=self.ols_only
        )
        result['ind'] = j
        return result

    return MPControl.map(regression_maker, range(self.G), tell_children=False)
def elasticnet_regress_dask(X, Y, params, G, genes, is_restart=False):
    """
    Execute regression (ElasticNet) distributed over a dask cluster.

    :param X: Design data; X.values is scattered to all workers
    :param Y: Response data; one row per gene
    :param params: dict
        Keyword parameters passed through to elastic_net
    :param G: int
        Number of genes
    :param genes: Gene labels for progress messages
    :param is_restart: bool
        True when this call is the single retry after a worker failure;
        prevents infinite recursion
    :return: list
        Returns a list of regression results that the pileup_data can process
    """
    assert MPControl.is_dask()

    from inferelator.regression import elasticnet_python
    DaskController = MPControl.client

    def regression_maker(j, x, y):
        # Announce progress; full verbosity only every 100th gene
        level = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(
            base_regression.PROGRESS_STR.format(gn=genes[j], i=j, total=G),
            level=level
        )
        data = elasticnet_python.elastic_net(x, y, params=params)
        data['ind'] = j
        return j, data

    # Scatter common data to workers
    [scatter_x] = DaskController.client.scatter([X.values], broadcast=True)

    # Wait for scattering to finish before creating futures
    try:
        distributed.wait(scatter_x, timeout=DASK_SCATTER_TIMEOUT)
    except distributed.TimeoutError:
        utils.Debug.vprint(
            "Scattering timeout during regression. Dask workers may be sick",
            level=0
        )

    future_list = [
        DaskController.client.submit(regression_maker, i, scatter_x,
                                     Y.values[i, :].flatten())
        for i in range(G)
    ]

    # Collect results as they finish instead of waiting for all workers to be done
    try:
        result_list = process_futures_into_list(future_list)
    except KeyError:
        # A worker died and its result was lost; retry the whole job once
        utils.Debug.vprint("Unrecoverable job error; restarting")
        if not is_restart:
            return elasticnet_regress_dask(X, Y, params, G, genes, is_restart=True)
        else:
            # FIX: `raise` must be a bare statement (re-raise the KeyError);
            # it must not be fused with the cancel call below
            raise

    # Release the scattered design matrix from worker memory
    DaskController.client.cancel(scatter_x)

    return result_list
def bbsr_regress_dask(X, Y, pp_mat, weights_mat, G, genes, nS):
    """
    Execute regression (BBSR) distributed over a dask cluster.

    :return: list
        Returns a list of regression results that the pileup_data can process
    """
    assert MPControl.is_dask()

    from inferelator.regression import bayes_stats
    DaskController = MPControl.client

    def regression_maker(j, x, y, pp, weights):
        # Announce progress; full verbosity only every 100th gene
        verbosity = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(
            base_regression.PROGRESS_STR.format(gn=genes[j], i=j, total=G),
            level=verbosity
        )
        result = bayes_stats.bbsr(
            x,
            utils.scale_vector(y),
            pp[j, :].flatten(),
            weights[j, :].flatten(),
            nS
        )
        result['ind'] = j
        return j, result

    # Broadcast the shared matrices to every worker once instead of per-task
    [scatter_x] = DaskController.client.scatter([X.values], broadcast=True, hash=False)
    [scatter_pp] = DaskController.client.scatter([pp_mat.values], broadcast=True, hash=False)
    [scatter_weights] = DaskController.client.scatter([weights_mat.values], broadcast=True, hash=False)

    # Block until scattering completes before submitting any work
    distributed.wait(scatter_x, timeout=DASK_SCATTER_TIMEOUT)
    distributed.wait(scatter_pp, timeout=DASK_SCATTER_TIMEOUT)
    distributed.wait(scatter_weights, timeout=DASK_SCATTER_TIMEOUT)

    future_list = [
        DaskController.client.submit(
            regression_maker, i, scatter_x,
            Y.get_gene_data(i, force_dense=True, flatten=True),
            scatter_pp, scatter_weights
        )
        for i in range(G)
    ]

    # Collect results as they finish instead of waiting for all workers to be done
    result_list = process_futures_into_list(future_list)

    # Release the scattered data from worker memory
    DaskController.client.cancel(scatter_x)
    DaskController.client.cancel(scatter_pp)
    DaskController.client.cancel(scatter_weights)

    return result_list
def lasso_stars_regress_dask(X, Y, alphas, num_subsamples, random_seed, method, params, G, genes):
    """
    Execute regression (LASSO-StARS) distributed over a dask cluster.

    :return: list
        Returns a list of regression results that the pileup_data can process
    """
    assert MPControl.is_dask()

    from inferelator.regression import stability_selection
    DaskController = MPControl.client

    def regression_maker(j, x, y):
        # Announce progress; full verbosity only every 100th gene
        verbosity = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(
            base_regression.PROGRESS_STR.format(gn=genes[j], i=j, total=G),
            level=verbosity
        )
        result = stability_selection.stars_model_select(
            x,
            utils.scale_vector(y),
            alphas,
            num_subsamples=num_subsamples,
            method=method,
            random_seed=random_seed,
            **params
        )
        result['ind'] = j
        return j, result

    # Broadcast the design matrix to every worker once
    [scatter_x] = DaskController.client.scatter([X.values], broadcast=True, hash=False)

    # Block until scattering completes before submitting any work
    distributed.wait(scatter_x, timeout=DASK_SCATTER_TIMEOUT)

    future_list = [
        DaskController.client.submit(
            regression_maker, i, scatter_x,
            Y.get_gene_data(i, force_dense=True, flatten=True)
        )
        for i in range(G)
    ]

    # Collect results as they finish instead of waiting for all workers to be done
    result_list = process_futures_into_list(future_list)

    # Release the scattered design matrix from worker memory
    DaskController.client.cancel(scatter_x)

    return result_list
def mutual_information(X, Y, bins, logtype=DEFAULT_LOG_TYPE, temp_dir=None):
    """
    Calculate the mutual information matrix between two data matrices,
    where the columns are equivalent conditions

    :param X: pd.DataFrame (m1 x n)
        The data from m1 variables across n conditions
    :param Y: pd.DataFrame (m2 x n)
        The data from m2 variables across n conditions
    :param bins: int
        Number of bins to discretize continuous data into for the
        generation of a contingency table
    :param logtype: np.log func
        Which type of log function should be used (log2 results in MI bits,
        log results in MI nats, log10... is weird)
    :param temp_dir: path
        Path to write temp files for multiprocessing
    :return mi: pd.DataFrame (m1 x m2)
        The mutual information between variables m1 and m2
    """

    # Both frames must describe the same conditions in the same order
    assert check.indexes_align((X.columns, Y.columns))

    # Keep the labels for the output frame, then drop to raw arrays
    mi_r = X.index
    mi_c = Y.index
    X = X.values
    Y = Y.values

    # Discretize the transposed inputs (conditions x variables)
    X = _make_array_discrete(X.transpose(), bins, axis=0)
    Y = _make_array_discrete(Y.transpose(), bins, axis=0)

    # Build the MI matrix with dask when available, otherwise locally
    if MPControl.is_dask():
        from inferelator.distributed.dask_functions import build_mi_array_dask
        mi = build_mi_array_dask(X, Y, bins, logtype=logtype)
    else:
        mi = build_mi_array(X, Y, bins, logtype=logtype, temp_dir=temp_dir)

    return pd.DataFrame(mi, index=mi_r, columns=mi_c)
def regress(self):
    """
    Execute multitask (AMUSR) regression for every response gene.

    :return: list
        Returns a list of regression results that the amusr_regression
        pileup_data can process
    """

    # Prefer the dask implementation when a cluster is available
    if MPControl.is_dask():
        from inferelator.distributed.dask_functions import amusr_regress_dask
        return amusr_regress_dask(
            self.X, self.Y, self.priors, self.prior_weight, self.n_tasks,
            self.genes, self.tfs, self.G,
            remove_autoregulation=self.remove_autoregulation
        )

    def regression_maker(j):
        # Announce progress; full verbosity only every 100th gene
        verbosity = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(
            base_regression.PROGRESS_STR.format(gn=self.genes[j], i=j, total=self.G),
            level=verbosity
        )

        gene = self.genes[j]

        # Drop the response gene from the predictors when autoregulation
        # is excluded
        if self.remove_autoregulation:
            tfs = [t for t in self.tfs if t != gene]
        else:
            tfs = self.tfs

        # Gather design / response pairs from every task that has this gene
        x, y, tasks = [], [], []
        for k in range(self.n_tasks):
            if gene in self.Y[k]:
                x.append(self.X[k].loc[:, tfs].values)                   # list([N, K])
                y.append(self.Y[k].loc[:, gene].values.reshape(-1, 1))   # list([N, 1])
                tasks.append(k)                                          # [T,]

        prior = format_prior(self.priors, gene, tasks, self.prior_weight)
        return run_regression_EBIC(x, y, tfs, tasks, gene, prior)

    return MPControl.map(regression_maker, range(self.G))
def sklearn_regress_dask(X, Y, model, G, genes, min_coef):
    """
    Execute regression (SKLearn) distributed over a dask cluster.

    :return: list
        Returns a list of regression results that the pileup_data can process
    """
    assert MPControl.is_dask()

    from inferelator.regression import sklearn_regression
    DaskController = MPControl.client

    def regression_maker(j, x, y):
        # Announce progress; full verbosity only every 100th gene
        verbosity = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(
            base_regression.PROGRESS_STR.format(gn=genes[j], i=j, total=G),
            level=verbosity
        )
        # Copy the model so each gene fits an independent estimator
        result = sklearn_regression.sklearn_gene(
            x,
            utils.scale_vector(y),
            copy.copy(model)
        )
        result['ind'] = j
        return j, result

    # Broadcast the design matrix to every worker once
    [scatter_x] = DaskController.client.scatter([X.values], broadcast=True, hash=False)

    # Block until scattering completes before submitting any work
    distributed.wait(scatter_x, timeout=DASK_SCATTER_TIMEOUT)

    future_list = [
        DaskController.client.submit(
            regression_maker, i, scatter_x,
            Y.get_gene_data(i, force_dense=True, flatten=True)
        )
        for i in range(G)
    ]

    # Collect results as they finish instead of waiting for all workers to be done
    result_list = process_futures_into_list(future_list)

    # Release the scattered design matrix from worker memory
    DaskController.client.cancel(scatter_x)

    return result_list
def regress(self):
    """
    Execute Elastic Net for every response gene.

    :return: list
        Returns a list of regression results that base_regression's
        pileup_data can process
    """

    # Prefer the dask implementation when a cluster is available
    if MPControl.is_dask():
        from inferelator.distributed.dask_functions import elasticnet_regress_dask
        return elasticnet_regress_dask(self.X, self.Y, self.params, self.G, self.genes)

    def regression_maker(j):
        # Announce progress; full verbosity only every 100th gene
        verbosity = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(
            base_regression.PROGRESS_STR.format(gn=self.genes[j], i=j, total=self.G),
            level=verbosity
        )
        result = elastic_net(self.X.values, self.Y.iloc[j, :].values, self.params)
        result['ind'] = j
        return result

    return MPControl.map(regression_maker, range(self.G), tell_children=False)
def dask_map(func, *args, **kwargs):
    """
    Dask map

    :param func: function to map
    :type func: callable
    :param args: positional arguments for func
    :type args: iterable
    :param kwargs: keyword (non-iterable) arguments for func.
        Keywords will be passed to dask client.map.
    :return: List of mapped results
    :rtype: list
    """
    assert MPControl.is_dask()

    def _indexed_call(f, idx, *call_args, **call_kwargs):
        # Tag each result with its submission index so results can be
        # reassembled in order
        return idx, f(*call_args, **call_kwargs)

    futures = [
        MPControl.client.client.submit(_indexed_call, func, idx, *packed, **kwargs)
        for idx, packed in enumerate(zip(*args))
    ]

    return process_futures_into_list(futures)
def regress(self, regression_function=None):
    """
    Execute multitask (AMUSR) regression for every response gene.

    :param regression_function: callable or None
        Optional override of the regression function (used for testing);
        falls back to self.regression_function
    :return: list
        Returns a list of regression results that the amusr_regression
        pileup_data can process
    """

    if regression_function is None:
        regression_function = self.regression_function

    if MPControl.is_dask():
        from inferelator.distributed.dask_functions import amusr_regress_dask
        # FIX: forward Cs, Ss, lambda_Bs and lambda_Ss to the dask path.
        # The local path below passes them to regression_function, but the
        # dask call previously omitted them, so user regularization settings
        # were silently ignored when running on a cluster.
        return amusr_regress_dask(
            self.X, self.Y, self.priors, self.prior_weight, self.n_tasks,
            self.genes, self.tfs, self.G,
            remove_autoregulation=self.remove_autoregulation,
            regression_function=regression_function,
            lambda_Bs=self.lambda_Bs, lambda_Ss=self.lambda_Ss,
            Cs=self.Cs, Ss=self.Ss,
            tol=self.tol, rel_tol=self.rel_tol,
            use_numba=self.use_numba
        )

    def regression_maker(j):
        # Announce progress; full verbosity only every 100th gene
        level = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(
            base_regression.PROGRESS_STR.format(gn=self.genes[j], i=j, total=self.G),
            level=level
        )

        gene = self.genes[j]

        # Drop the response gene from the predictors when autoregulation
        # is excluded
        if self.remove_autoregulation:
            tfs = [t for t in self.tfs if t != gene]
        else:
            tfs = self.tfs

        # Gather design / response pairs from every task that has this gene
        x, y, tasks = [], [], []
        for k in range(self.n_tasks):
            if gene in self.Y[k].gene_names:
                x.append(self.X[k].get_gene_data(tfs))                     # list([N, K])
                y.append(self.Y[k].get_gene_data(gene,
                                                 force_dense=True).reshape(-1, 1))  # list([N, 1])
                tasks.append(k)                                            # [T,]

        prior = format_prior(self.priors, gene, tasks, self.prior_weight, tfs=tfs)

        return regression_function(
            x, y, tfs, tasks, gene, prior,
            Cs=self.Cs, Ss=self.Ss,
            lambda_Bs=self.lambda_Bs, lambda_Ss=self.lambda_Ss,
            tol=self.tol, rel_tol=self.rel_tol,
            use_numba=self.use_numba
        )

    return MPControl.map(regression_maker, range(self.G))
def build_mi_array_dask(X, Y, bins, logtype):
    """
    Calculate MI into an array with dask (the naive map is very inefficient)

    :param X: np.ndarray (n x m1)
        Discrete array of bins
    :param Y: np.ndarray (n x m2)
        Discrete array of bins
    :param bins: int
        The total number of bins that were used to make the arrays discrete
    :param logtype: np.log func
        Which log function to use (log2 gives bits, ln gives nats)
    :return mi: np.ndarray (m1 x m2)
        Returns the mutual information array
    """
    assert MPControl.is_dask()

    from inferelator.regression.mi import _calc_mi, _make_table, _make_discrete

    # Get a reference to the Dask controller
    DaskController = MPControl.client

    m1, m2 = X.shape[1], Y.shape[1]

    def mi_make(i, x, y):
        # One worker task computes a full row of the MI matrix
        x = _make_discrete(x, bins)
        row = [
            _calc_mi(_make_table(x, y[:, j], bins), logtype=logtype)
            for j in range(m2)
        ]
        return i, row

    # Broadcast Y to every worker once and keep track of it as a Future
    [scatter_y] = DaskController.client.scatter([Y], broadcast=True, hash=False)

    # Block until scattering completes before submitting any work
    distributed.wait(scatter_y, timeout=DASK_SCATTER_TIMEOUT)

    # One future per column of X (densify the column if X is sparse)
    future_list = []
    for i in range(m1):
        column = X[:, i].A.flatten() if sps.isspmatrix(X) else X[:, i].flatten()
        future_list.append(DaskController.client.submit(mi_make, i, column, scatter_y))

    # Collect results as they finish instead of waiting for all workers to be done
    mi_list = process_futures_into_list(future_list)

    # Convert the list of row-lists to an (m1 x m2) array and sanity-check it
    mi = np.array(mi_list)
    assert (m1, m2) == mi.shape, \
        "Array {sh} produced [({m1}, {m2}) expected]".format(sh=mi.shape, m1=m1, m2=m2)

    # Release the scattered Y from worker memory
    DaskController.client.cancel(scatter_y)

    return mi
def amusr_regress_dask(X, Y, priors, prior_weight, n_tasks, genes, tfs, G,
                       remove_autoregulation=True, lambda_Bs=None, lambda_Ss=None,
                       Cs=None, Ss=None, regression_function=None,
                       tol=None, rel_tol=None, use_numba=False):
    """
    Execute multitask (AMUSR) regression distributed over a dask cluster.

    :return: list
        Returns a list of regression results that the amusr_regression
        pileup_data can process
    """
    assert MPControl.is_dask()

    from inferelator.regression.amusr_regression import format_prior, run_regression_EBIC
    DaskController = MPControl.client

    # Allows injecting a regression function for testing
    if regression_function is None:
        regression_function = run_regression_EBIC

    # Gets genes, n_tasks, prior_weight, and remove_autoregulation from
    # regress_dask(); other arguments are passed in

    def regression_maker(j, x_df, y_list, prior, tf):
        # Announce progress; full verbosity only every 100th gene
        verbosity = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(
            base_regression.PROGRESS_STR.format(gn=genes[j], i=j, total=G),
            level=verbosity
        )

        gene = genes[j]

        # Drop the response gene from the predictors when autoregulation
        # is excluded
        if remove_autoregulation:
            tf = [t for t in tf if t != gene]

        # Unpack the pre-built (task, response) pairs for this gene
        x, y, tasks = [], [], []
        for k, y_data in y_list:
            x.append(x_df[k].get_gene_data(tf))  # list([N, K])
            y.append(y_data)
            tasks.append(k)  # [T,]

        prior = format_prior(prior, gene, tasks, prior_weight, tfs=tf)

        return j, regression_function(
            x, y, tf, tasks, gene, prior,
            lambda_Bs=lambda_Bs, lambda_Ss=lambda_Ss,
            Cs=Cs, Ss=Ss,
            tol=tol, rel_tol=rel_tol, use_numba=use_numba
        )

    def response_maker(y_df, i):
        # Collect (task index, response vector) pairs for every task that
        # contains this gene
        gene = genes[i]
        responses = []
        for k in range(n_tasks):
            if gene in y_df[k].gene_names:
                responses.append(
                    (k, y_df[k].get_gene_data(gene, force_dense=True).reshape(-1, 1))
                )
        return responses

    # Scatter common data to workers
    [scatter_x] = DaskController.client.scatter([X], broadcast=True, hash=False)
    [scatter_priors] = DaskController.client.scatter([priors], broadcast=True, hash=False)

    # Wait for scattering to finish before creating futures
    distributed.wait(scatter_x, timeout=DASK_SCATTER_TIMEOUT)
    distributed.wait(scatter_priors, timeout=DASK_SCATTER_TIMEOUT)

    future_list = [
        DaskController.client.submit(regression_maker, i, scatter_x,
                                     response_maker(Y, i), scatter_priors, tfs)
        for i in range(G)
    ]

    # Collect results as they finish instead of waiting for all workers to be done
    result_list = process_futures_into_list(future_list)

    # Release the scattered data, then restart the client to reclaim
    # worker memory held by this run
    DaskController.client.cancel(scatter_x)
    DaskController.client.cancel(scatter_priors)
    DaskController.client.restart()

    return result_list
def amusr_regress_dask(X, Y, priors, prior_weight, n_tasks, genes, tfs, G,
                       remove_autoregulation=True, is_restart=False):
    """
    Execute multitask (AMUSR) regression distributed over a dask cluster.

    :param is_restart: bool
        True when this call is the single retry after a worker failure;
        prevents infinite recursion
    :return: list
        Returns a list of regression results that the amusr_regression
        pileup_data can process
    """
    assert MPControl.is_dask()

    from inferelator.regression.amusr_regression import format_prior, run_regression_EBIC
    DaskController = MPControl.client

    # Gets genes, n_tasks, prior_weight, and remove_autoregulation from
    # regress_dask(); other arguments are passed in

    def regression_maker(j, x_df, y_list, prior, tf):
        # Announce progress; full verbosity only every 100th gene
        level = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(
            base_regression.PROGRESS_STR.format(gn=genes[j], i=j, total=G),
            level=level
        )

        gene = genes[j]

        # Drop the response gene from the predictors when autoregulation
        # is excluded
        if remove_autoregulation:
            tf = [t for t in tf if t != gene]

        # Unpack the pre-built (task, response) pairs for this gene
        x, y, tasks = [], [], []
        for k, y_data in y_list:
            x.append(x_df[k].loc[:, tf].values)  # list([N, K])
            y.append(y_data)
            tasks.append(k)  # [T,]
        del y_list

        prior = format_prior(prior, gene, tasks, prior_weight)
        return j, run_regression_EBIC(x, y, tf, tasks, gene, prior)

    def response_maker(y_df, i):
        # Collect (task index, response vector) pairs for every task that
        # contains this gene
        y = []
        gene = genes[i]
        for k in range(n_tasks):
            if gene in y_df[k]:
                y.append((k, y_df[k].loc[:, gene].values.reshape(-1, 1)))
        return y

    # Scatter common data to workers
    [scatter_x] = DaskController.client.scatter([X], broadcast=True)
    [scatter_priors] = DaskController.client.scatter([priors], broadcast=True)

    # Wait for scattering to finish before creating futures
    try:
        distributed.wait(scatter_x, timeout=DASK_SCATTER_TIMEOUT)
        distributed.wait(scatter_priors, timeout=DASK_SCATTER_TIMEOUT)
    except distributed.TimeoutError:
        # FIX: reconstructed the log message that was broken across a
        # chunk boundary; matches the identical message used by
        # elasticnet_regress_dask
        utils.Debug.vprint(
            "Scattering timeout during regression. Dask workers may be sick",
            level=0
        )

    future_list = [
        DaskController.client.submit(regression_maker, i, scatter_x,
                                     response_maker(Y, i), scatter_priors, tfs)
        for i in range(G)
    ]

    # Collect results as they finish instead of waiting for all workers to be done
    try:
        result_list = process_futures_into_list(future_list)
    except KeyError:
        # A worker died and its result was lost; retry the whole job once
        utils.Debug.vprint("Unrecoverable job error; restarting")
        if not is_restart:
            return amusr_regress_dask(
                X, Y, priors, prior_weight, n_tasks, genes, tfs, G,
                remove_autoregulation=remove_autoregulation, is_restart=True)
        else:
            # FIX: `raise` must be a bare statement (re-raise the KeyError);
            # it must not be fused with the cancel calls below
            raise

    # Release the scattered data from worker memory
    DaskController.client.cancel(scatter_x)
    DaskController.client.cancel(scatter_priors)

    return result_list