Пример #1
0
    def regress(self):
        """
        Run StARS-regularized regression for every gene.

        :return: list
            List of regression result dicts that base_regression's pileup_data can process
        """

        # Defer to the dask implementation when a dask cluster is available
        if MPControl.is_dask():
            from inferelator.distributed.dask_functions import lasso_stars_regress_dask
            return lasso_stars_regress_dask(self.X, self.Y, self.alphas, self.num_subsamples, self.random_seed,
                                            self.method, self.params, self.G, self.genes)

        def _stars_for_gene(gene_idx):
            # Progress message: loud (level 0) every 100 genes, quiet (level 2) otherwise
            verbosity = 0 if gene_idx % 100 == 0 else 2
            utils.Debug.allprint(base_regression.PROGRESS_STR.format(gn=self.genes[gene_idx], i=gene_idx, total=self.G),
                                 level=verbosity)

            response = utils.scale_vector(self.Y.get_gene_data(gene_idx, force_dense=True, flatten=True))
            result = stars_model_select(self.X.values,
                                        response,
                                        self.alphas,
                                        method=self.method,
                                        num_subsamples=self.num_subsamples,
                                        random_seed=self.random_seed,
                                        **self.params)
            result['ind'] = gene_idx
            return result

        return MPControl.map(_stars_for_gene, range(self.G), tell_children=False)
    def regress(self):
        """
        Run the configured scikit-learn model on every gene.

        :return: list
            List of regression result dicts that base_regression's pileup_data can process
        """

        # Defer to the dask implementation when a dask cluster is available
        if MPControl.is_dask():
            from inferelator.distributed.dask_functions import sklearn_regress_dask
            return sklearn_regress_dask(self.X, self.Y, self.model, self.G,
                                        self.genes, self.min_coef)

        def _fit_gene(gene_idx):
            # Progress message: loud (level 0) every 100 genes, quiet (level 2) otherwise
            verbosity = 0 if gene_idx % 100 == 0 else 2
            utils.Debug.allprint(
                base_regression.PROGRESS_STR.format(gn=self.genes[gene_idx],
                                                    i=gene_idx,
                                                    total=self.G),
                level=verbosity)

            response = utils.scale_vector(
                self.Y.get_gene_data(gene_idx,
                                     force_dense=True,
                                     flatten=True))

            # Copy the model so each gene fits an independent estimator
            result = sklearn_gene(self.X.values,
                                  response,
                                  copy.copy(self.model),
                                  min_coef=self.min_coef)
            result['ind'] = gene_idx
            return result

        return MPControl.map(_fit_gene, range(self.G), tell_children=False)
Пример #3
0
def mutual_information(x, y, bins, logtype=DEFAULT_LOG_TYPE):
    """
    Calculate the mutual information matrix between two data matrices, where the columns are equivalent conditions

    :param x: np.array (n x m1)
        The data from m1 variables across n conditions
    :param y: np.array (n x m2)
        The data from m2 variables across n conditions
    :param bins: int
        Number of bins to discretize continuous data into for the generation of a contingency table
    :param logtype: np.log func
        Which type of log function should be used (log2 results in MI bits, log results in MI nats, log10... is weird)

    :return mi: pd.DataFrame (m1 x m2)
        The mutual information between variables m1 and m2
    """

    # Densify y if it is sparse, then bin each column into discrete levels
    if sps.isspmatrix(y):
        y = y.A
    y = _make_array_discrete(y, bins, axis=0)

    # Build the MI matrix locally unless a dask cluster is available
    if not MPControl.is_dask():
        return build_mi_array(x, y, bins, logtype=logtype)

    from inferelator.distributed.dask_functions import build_mi_array_dask
    return build_mi_array_dask(x, y, bins, logtype=logtype)
Пример #4
0
    def regress(self):
        """
        Run BBSR regression for every gene.

        :return: pd.DataFrame [G x K], pd.DataFrame [G x K]
            Returns the regression betas and beta error reductions for all threads if this is the master thread (rank 0)
            Returns None, None if it's a subordinate thread
        """

        # Defer to the dask implementation when a dask cluster is available
        if MPControl.is_dask():
            from inferelator.distributed.dask_functions import bbsr_regress_dask
            return bbsr_regress_dask(self.X, self.Y, self.pp, self.weights_mat,
                                     self.G, self.genes, self.nS)

        def _bbsr_for_gene(gene_idx):
            # Progress message: loud (level 0) every 100 genes, quiet (level 2) otherwise
            verbosity = 0 if gene_idx % 100 == 0 else 2
            utils.Debug.allprint(
                base_regression.PROGRESS_STR.format(gn=self.genes[gene_idx],
                                                    i=gene_idx,
                                                    total=self.G),
                level=verbosity)

            response = utils.scale_vector(
                self.Y.get_gene_data(gene_idx, force_dense=True, flatten=True))

            # Row gene_idx of the predictor and weight matrices belongs to this gene
            result = bayes_stats.bbsr(
                self.X.values,
                response,
                self.pp.iloc[gene_idx, :].values.flatten(),
                self.weights_mat.iloc[gene_idx, :].values.flatten(),
                self.nS,
                ordinary_least_squares=self.ols_only)
            result['ind'] = gene_idx
            return result

        return MPControl.map(_bbsr_for_gene,
                             range(self.G),
                             tell_children=False)
def elasticnet_regress_dask(X, Y, params, G, genes, is_restart=False):
    """
    Execute regression (ElasticNet) on a dask cluster.

    :param X: Design data; X.values is scattered (broadcast) to all workers
    :param Y: Response data; row i of Y.values is the response vector for gene i
    :param params: dict
        Parameters passed through to elasticnet_python.elastic_net
    :param G: int
        Number of genes to regress
    :param genes: Gene labels (used for progress messages)
    :param is_restart: bool
        True when this call is a one-shot retry after an unrecoverable worker error

    :return: list
        Returns a list of regression results that the pileup_data can process
    """
    assert MPControl.is_dask()

    from inferelator.regression import elasticnet_python
    DaskController = MPControl.client

    # Worker function: regress one gene; returns (index, result) so results
    # can be reordered on collection
    def regression_maker(j, x, y):
        # Progress message: loud (level 0) every 100 genes, quiet (level 2) otherwise
        level = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(base_regression.PROGRESS_STR.format(gn=genes[j],
                                                                 i=j,
                                                                 total=G),
                             level=level)
        data = elasticnet_python.elastic_net(x, y, params=params)
        data['ind'] = j
        return j, data

    # Scatter common data to workers
    [scatter_x] = DaskController.client.scatter([X.values], broadcast=True)

    # Wait for scattering to finish before creating futures
    try:
        distributed.wait(scatter_x, timeout=DASK_SCATTER_TIMEOUT)
    except distributed.TimeoutError:
        utils.Debug.vprint(
            "Scattering timeout during regression. Dask workers may be sick",
            level=0)

    try:
        future_list = [
            DaskController.client.submit(regression_maker, i, scatter_x,
                                         Y.values[i, :].flatten())
            for i in range(G)
        ]

        # Collect results as they finish instead of waiting for all workers to be done
        try:
            result_list = process_futures_into_list(future_list)
        except KeyError:
            # A future key was lost (worker died badly); retry the whole job once
            utils.Debug.vprint("Unrecoverable job error; restarting")
            if not is_restart:
                return elasticnet_regress_dask(X,
                                               Y,
                                               params,
                                               G,
                                               genes,
                                               is_restart=True)
            else:
                raise
    finally:
        # Bug fix: release the scattered data on every exit path; the original
        # only cancelled scatter_x on success, leaking it on restart/raise
        DaskController.client.cancel(scatter_x)

    return result_list
def bbsr_regress_dask(X, Y, pp_mat, weights_mat, G, genes, nS):
    """
    Execute regression (BBSR)

    :param X: Design data; X.values is scattered (broadcast) to all workers
    :param Y: Response data with a get_gene_data accessor; gene i's vector is
        sent with its own future
    :param pp_mat: pd.DataFrame
        Predictor matrix; row j is used for gene j
    :param weights_mat: pd.DataFrame
        Prior weights matrix; row j is used for gene j
    :param G: int
        Number of genes to regress
    :param genes: Gene labels (used for progress messages)
    :param nS: Passed through to bayes_stats.bbsr
        # presumably the number of predictors to retain — TODO confirm against bayes_stats
    :return: list
        Returns a list of regression results that the pileup_data can process
    """
    assert MPControl.is_dask()

    from inferelator.regression import bayes_stats
    DaskController = MPControl.client

    # Worker function: regress one gene; returns (index, result) so results
    # can be reordered on collection
    def regression_maker(j, x, y, pp, weights):
        # Progress message: loud (level 0) every 100 genes, quiet (level 2) otherwise
        level = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(base_regression.PROGRESS_STR.format(gn=genes[j],
                                                                 i=j,
                                                                 total=G),
                             level=level)
        # The full pp/weights matrices are broadcast; each task slices its own row j
        data = bayes_stats.bbsr(x, utils.scale_vector(y), pp[j, :].flatten(),
                                weights[j, :].flatten(), nS)
        data['ind'] = j
        return j, data

    # Scatter common data to workers
    # (hash=False gives the scattered data unique keys instead of content-hashed ones)
    [scatter_x] = DaskController.client.scatter([X.values],
                                                broadcast=True,
                                                hash=False)
    [scatter_pp] = DaskController.client.scatter([pp_mat.values],
                                                 broadcast=True,
                                                 hash=False)
    [scatter_weights] = DaskController.client.scatter([weights_mat.values],
                                                      broadcast=True,
                                                      hash=False)

    # Wait for scattering to finish before creating futures
    distributed.wait(scatter_x, timeout=DASK_SCATTER_TIMEOUT)
    distributed.wait(scatter_pp, timeout=DASK_SCATTER_TIMEOUT)
    distributed.wait(scatter_weights, timeout=DASK_SCATTER_TIMEOUT)

    # One future per gene; only the (small) response vector travels per-future
    future_list = [
        DaskController.client.submit(
            regression_maker, i, scatter_x,
            Y.get_gene_data(i, force_dense=True, flatten=True), scatter_pp,
            scatter_weights) for i in range(G)
    ]

    # Collect results as they finish instead of waiting for all workers to be done
    result_list = process_futures_into_list(future_list)

    # Release the broadcast data from the workers
    DaskController.client.cancel(scatter_x)
    DaskController.client.cancel(scatter_pp)
    DaskController.client.cancel(scatter_weights)

    return result_list
def lasso_stars_regress_dask(X, Y, alphas, num_subsamples, random_seed, method,
                             params, G, genes):
    """
    Execute regression (LASSO-StARS) on a dask cluster.

    :return: list
        Returns a list of regression results that the pileup_data can process
    """
    assert MPControl.is_dask()

    from inferelator.regression import stability_selection
    dask_controller = MPControl.client

    def _stars_worker(gene_idx, design, response):
        # Progress message: loud (level 0) every 100 genes, quiet (level 2) otherwise
        verbosity = 0 if gene_idx % 100 == 0 else 2
        utils.Debug.allprint(base_regression.PROGRESS_STR.format(gn=genes[gene_idx],
                                                                 i=gene_idx,
                                                                 total=G),
                             level=verbosity)

        result = stability_selection.stars_model_select(
            design,
            utils.scale_vector(response),
            alphas,
            num_subsamples=num_subsamples,
            method=method,
            random_seed=random_seed,
            **params)
        result['ind'] = gene_idx
        # Tag with the gene index so results can be reordered on collection
        return gene_idx, result

    # Broadcast the design matrix to every worker up front
    [scatter_x] = dask_controller.client.scatter([X.values],
                                                 broadcast=True,
                                                 hash=False)

    # Block until the scatter has landed before queueing work
    distributed.wait(scatter_x, timeout=DASK_SCATTER_TIMEOUT)

    futures = [
        dask_controller.client.submit(
            _stars_worker, i, scatter_x,
            Y.get_gene_data(i, force_dense=True, flatten=True))
        for i in range(G)
    ]

    # Collect results as they finish instead of waiting for all workers to be done
    results = process_futures_into_list(futures)

    # Release the broadcast design matrix from the workers
    dask_controller.client.cancel(scatter_x)

    return results
Пример #8
0
def mutual_information(X, Y, bins, logtype=DEFAULT_LOG_TYPE, temp_dir=None):
    """
    Calculate the mutual information matrix between two data matrices, where the columns are equivalent conditions

    :param X: pd.DataFrame (m1 x n)
        The data from m1 variables across n conditions
    :param Y: pd.DataFrame (m2 x n)
        The data from m2 variables across n conditions
    :param bins: int
        Number of bins to discretize continuous data into for the generation of a contingency table
    :param logtype: np.log func
        Which type of log function should be used (log2 results in MI bits, log results in MI nats, log10... is weird)
    :param temp_dir: path
        Path to write temp files for multiprocessing

    :return mi: pd.DataFrame (m1 x m2)
        The mutual information between variables m1 and m2
    """

    assert check.indexes_align((X.columns, Y.columns))

    # Keep the variable labels before dropping down to raw arrays
    row_labels = X.index
    col_labels = Y.index

    # Transpose so conditions are rows, then bin each variable into discrete levels
    x_discrete = _make_array_discrete(X.values.transpose(), bins, axis=0)
    y_discrete = _make_array_discrete(Y.values.transpose(), bins, axis=0)

    # Build the MI matrix with dask if a cluster is available, locally otherwise
    if MPControl.is_dask():
        from inferelator.distributed.dask_functions import build_mi_array_dask
        mi_array = build_mi_array_dask(x_discrete, y_discrete, bins,
                                       logtype=logtype)
    else:
        mi_array = build_mi_array(x_discrete,
                                  y_discrete,
                                  bins,
                                  logtype=logtype,
                                  temp_dir=temp_dir)

    return pd.DataFrame(mi_array, index=row_labels, columns=col_labels)
    def regress(self):
        """
        Run multitask (AMUSR) regression for every gene.

        :return: list
            Returns a list of regression results that the amusr_regression pileup_data can process
        """

        # Defer to the dask implementation when a dask cluster is available
        if MPControl.is_dask():
            from inferelator.distributed.dask_functions import amusr_regress_dask
            return amusr_regress_dask(
                self.X,
                self.Y,
                self.priors,
                self.prior_weight,
                self.n_tasks,
                self.genes,
                self.tfs,
                self.G,
                remove_autoregulation=self.remove_autoregulation)

        def _ebic_for_gene(gene_idx):
            # Progress message: loud (level 0) every 100 genes, quiet (level 2) otherwise
            verbosity = 0 if gene_idx % 100 == 0 else 2
            utils.Debug.allprint(base_regression.PROGRESS_STR.format(
                gn=self.genes[gene_idx], i=gene_idx, total=self.G),
                                 level=verbosity)

            gene = self.genes[gene_idx]

            # Optionally drop the target gene from its own predictor set
            if self.remove_autoregulation:
                tfs = [t for t in self.tfs if t != gene]
            else:
                tfs = self.tfs

            # Assemble per-task design/response pairs for tasks that measure this gene
            design, response, tasks = [], [], []
            for task_id in range(self.n_tasks):
                if gene in self.Y[task_id]:
                    design.append(self.X[task_id].loc[:, tfs].values)  # list([N, K])
                    response.append(self.Y[task_id].loc[:, gene].values.reshape(
                        -1, 1))  # list([N, 1])
                    tasks.append(task_id)  # [T,]

            prior = format_prior(self.priors, gene, tasks, self.prior_weight)
            return run_regression_EBIC(design, response, tfs, tasks, gene, prior)

        return MPControl.map(_ebic_for_gene, range(self.G))
def sklearn_regress_dask(X, Y, model, G, genes, min_coef):
    """
    Execute regression (SKLearn) on a dask cluster.

    :param X: Design data; X.values is scattered (broadcast) to all workers
    :param Y: Response data with a get_gene_data accessor
    :param model: sklearn model instance; copied per-gene before fitting
    :param G: int
        Number of genes to regress
    :param genes: Gene labels (used for progress messages)
    :param min_coef: Minimum coefficient threshold forwarded to sklearn_gene

    :return: list
        Returns a list of regression results that the pileup_data can process
    """
    assert MPControl.is_dask()

    from inferelator.regression import sklearn_regression
    DaskController = MPControl.client

    # Worker function: regress one gene; returns (index, result) so results
    # can be reordered on collection
    def regression_maker(j, x, y):
        # Progress message: loud (level 0) every 100 genes, quiet (level 2) otherwise
        level = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(base_regression.PROGRESS_STR.format(gn=genes[j],
                                                                 i=j,
                                                                 total=G),
                             level=level)
        # Bug fix: min_coef was accepted by this function but never forwarded,
        # so the coefficient threshold was silently ignored on the dask path
        # (the local regress() path passes min_coef to sklearn_gene)
        data = sklearn_regression.sklearn_gene(x, utils.scale_vector(y),
                                               copy.copy(model),
                                               min_coef=min_coef)
        data['ind'] = j
        return j, data

    # Scatter common data to workers
    [scatter_x] = DaskController.client.scatter([X.values],
                                                broadcast=True,
                                                hash=False)

    # Wait for scattering to finish before creating futures
    distributed.wait(scatter_x, timeout=DASK_SCATTER_TIMEOUT)

    future_list = [
        DaskController.client.submit(
            regression_maker, i, scatter_x,
            Y.get_gene_data(i, force_dense=True, flatten=True))
        for i in range(G)
    ]

    # Collect results as they finish instead of waiting for all workers to be done
    result_list = process_futures_into_list(future_list)

    # Release the broadcast design matrix from the workers
    DaskController.client.cancel(scatter_x)

    return result_list
Пример #11
0
    def regress(self):
        """
        Run Elastic Net regression for every gene.

        :return: list
            Returns a list of regression results that base_regression's pileup_data can process
        """

        # Defer to the dask implementation when a dask cluster is available
        if MPControl.is_dask():
            from inferelator.distributed.dask_functions import elasticnet_regress_dask
            return elasticnet_regress_dask(self.X, self.Y, self.params, self.G, self.genes)

        def _elastic_net_for_gene(gene_idx):
            # Progress message: loud (level 0) every 100 genes, quiet (level 2) otherwise
            verbosity = 0 if gene_idx % 100 == 0 else 2
            utils.Debug.allprint(base_regression.PROGRESS_STR.format(gn=self.genes[gene_idx], i=gene_idx, total=self.G),
                                 level=verbosity)

            result = elastic_net(self.X.values, self.Y.iloc[gene_idx, :].values, self.params)
            result['ind'] = gene_idx
            return result

        return MPControl.map(_elastic_net_for_gene, range(self.G), tell_children=False)
def dask_map(func, *args, **kwargs):
    """
    Map a function across iterables of arguments on the dask cluster.

    :param func: function to map
    :type func: callable
    :param args: positional arguments for func; iterated in lockstep (zip)
    :type args: iterable
    :param kwargs: keyword (non-iterable) arguments; passed to the dask submit call
    :return: List of mapped results
    :rtype: list
    """

    assert MPControl.is_dask()

    # Tag each call with its position so results can be reordered on collection
    def _indexed_call(fn, idx, *call_args, **call_kwargs):
        return idx, fn(*call_args, **call_kwargs)

    futures = []
    for idx, packed_args in enumerate(zip(*args)):
        futures.append(
            MPControl.client.client.submit(_indexed_call, func, idx,
                                           *packed_args, **kwargs))

    return process_futures_into_list(futures)
    def regress(self, regression_function=None):
        """
        Execute multitask (AMUSR) regression for every gene.

        :param regression_function: Optional callable that performs the
            per-gene multitask regression; defaults to
            self.regression_function. Injectable so tests can substitute
            a different implementation.
        :return: list
            Returns a list of regression results that the amusr_regression pileup_data can process
        """

        # Fall back to the configured regression function unless one was injected
        regression_function = self.regression_function if regression_function is None else regression_function

        # Defer to the dask implementation when a dask cluster is available
        if MPControl.is_dask():
            from inferelator.distributed.dask_functions import amusr_regress_dask
            return amusr_regress_dask(
                self.X,
                self.Y,
                self.priors,
                self.prior_weight,
                self.n_tasks,
                self.genes,
                self.tfs,
                self.G,
                remove_autoregulation=self.remove_autoregulation,
                regression_function=regression_function,
                tol=self.tol,
                rel_tol=self.rel_tol,
                use_numba=self.use_numba)

        # Regress a single gene (index j) across every task that measures it
        def regression_maker(j):
            # Progress message: loud (level 0) every 100 genes, quiet (level 2) otherwise
            level = 0 if j % 100 == 0 else 2
            utils.Debug.allprint(base_regression.PROGRESS_STR.format(
                gn=self.genes[j], i=j, total=self.G),
                                 level=level)

            gene = self.genes[j]
            x, y, tasks = [], [], []

            # Optionally exclude the gene itself from its own predictor TFs
            if self.remove_autoregulation:
                tfs = [t for t in self.tfs if t != gene]
            else:
                tfs = self.tfs

            # Collect per-task design (X) and response (Y) only for tasks
            # in which this gene was measured
            for k in range(self.n_tasks):
                if gene in self.Y[k].gene_names:
                    x.append(self.X[k].get_gene_data(tfs))  # list([N, K])
                    y.append(self.Y[k].get_gene_data(
                        gene, force_dense=True).reshape(-1, 1))  # list([N, 1])
                    tasks.append(k)  # [T,]

            prior = format_prior(self.priors,
                                 gene,
                                 tasks,
                                 self.prior_weight,
                                 tfs=tfs)
            return regression_function(x,
                                       y,
                                       tfs,
                                       tasks,
                                       gene,
                                       prior,
                                       Cs=self.Cs,
                                       Ss=self.Ss,
                                       lambda_Bs=self.lambda_Bs,
                                       lambda_Ss=self.lambda_Ss,
                                       tol=self.tol,
                                       rel_tol=self.rel_tol,
                                       use_numba=self.use_numba)

        return MPControl.map(regression_maker, range(self.G))
def build_mi_array_dask(X, Y, bins, logtype):
    """
    Calculate MI into an array with dask (the naive map is very inefficient)

    :param X: np.ndarray (n x m1)
        Discrete array of bins
    :param Y: np.ndarray (n x m2)
        Discrete array of bins
    :param bins: int
        The total number of bins that were used to make the arrays discrete
    :param logtype: np.log func
        Which log function to use (log2 gives bits, ln gives nats)
    :return mi: np.ndarray (m1 x m2)
        Returns the mutual information array
    """

    assert MPControl.is_dask()

    from inferelator.regression.mi import _calc_mi, _make_table, _make_discrete

    # Get a reference to the Dask controller
    dask_controller = MPControl.client

    n_rows, n_cols = X.shape[1], Y.shape[1]

    def _mi_row(row_idx, x_col, y_all):
        # Discretize this X column, then compute MI against every Y column
        x_col = _make_discrete(x_col, bins)
        row = []
        for col_idx in range(n_cols):
            row.append(
                _calc_mi(_make_table(x_col, y_all[:, col_idx], bins),
                         logtype=logtype))
        return row_idx, row

    # Broadcast Y once; each future only carries a single column of X
    [scatter_y] = dask_controller.client.scatter([Y],
                                                 broadcast=True,
                                                 hash=False)

    # Block until the scatter has landed before queueing work
    distributed.wait(scatter_y, timeout=DASK_SCATTER_TIMEOUT)

    # One future per column of X (densified if X is sparse)
    futures = []
    for i in range(n_rows):
        x_column = X[:, i].A.flatten() if sps.isspmatrix(X) else X[:, i].flatten()
        futures.append(
            dask_controller.client.submit(_mi_row, i, x_column, scatter_y))

    # Collect results as they finish instead of waiting for all workers to be done
    mi = np.array(process_futures_into_list(futures))

    assert (n_rows, n_cols) == mi.shape, \
        "Array {sh} produced [({m1}, {m2}) expected]".format(sh=mi.shape,
                                                             m1=n_rows,
                                                             m2=n_cols)

    # Release the broadcast Y array from the workers
    dask_controller.client.cancel(scatter_y)

    return mi
def amusr_regress_dask(X,
                       Y,
                       priors,
                       prior_weight,
                       n_tasks,
                       genes,
                       tfs,
                       G,
                       remove_autoregulation=True,
                       lambda_Bs=None,
                       lambda_Ss=None,
                       Cs=None,
                       Ss=None,
                       regression_function=None,
                       tol=None,
                       rel_tol=None,
                       use_numba=False):
    """
    Execute multitask (AMUSR) regression on a dask cluster.

    :param X: Per-task design data, indexable by task; scattered to all workers
    :param Y: Per-task response data with gene_names / get_gene_data accessors
    :param priors: Prior network data; scattered to all workers
    :param prior_weight: Weight applied to prior edges by format_prior
    :param n_tasks: int, number of tasks
    :param genes: Gene labels; gene j is regressed by future j
    :param tfs: Candidate regulator (TF) labels
    :param G: int, number of genes to regress
    :param remove_autoregulation: bool, drop the target gene from its own TF list
    :param lambda_Bs, lambda_Ss, Cs, Ss, tol, rel_tol, use_numba:
        Passed through unchanged to the regression function
    :param regression_function: Optional override of run_regression_EBIC
        (allows injecting a regression function for testing)

    :return: list
        Returns a list of regression results that the amusr_regression pileup_data can process
    """

    assert MPControl.is_dask()

    from inferelator.regression.amusr_regression import format_prior, run_regression_EBIC
    DaskController = MPControl.client

    # Allows injecting a regression function for testing
    regression_function = run_regression_EBIC if regression_function is None else regression_function

    # Gets genes, n_tasks, prior_weight, and remove_autoregulation from regress_dask()
    # Other arguments are passed in
    def regression_maker(j, x_df, y_list, prior, tf):
        # Progress message: loud (level 0) every 100 genes, quiet (level 2) otherwise
        level = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(base_regression.PROGRESS_STR.format(gn=genes[j],
                                                                 i=j,
                                                                 total=G),
                             level=level)

        gene = genes[j]
        x, y, tasks = [], [], []

        # Optionally exclude the gene itself from its own predictor TFs
        if remove_autoregulation:
            tf = [t for t in tf if t != gene]
        else:
            pass

        # y_list holds (task index, response vector) pairs built by response_maker
        for k, y_data in y_list:
            x.append(x_df[k].get_gene_data(tf))  # list([N, K])
            y.append(y_data)
            tasks.append(k)  # [T,]

        prior = format_prior(prior, gene, tasks, prior_weight, tfs=tf)
        return j, regression_function(x,
                                      y,
                                      tf,
                                      tasks,
                                      gene,
                                      prior,
                                      lambda_Bs=lambda_Bs,
                                      lambda_Ss=lambda_Ss,
                                      Cs=Cs,
                                      Ss=Ss,
                                      tol=tol,
                                      rel_tol=rel_tol,
                                      use_numba=use_numba)

    # Build (task, response) pairs for gene i, one per task that measures it;
    # runs locally so only the needed response vectors travel with each future
    def response_maker(y_df, i):
        y = []
        gene = genes[i]
        for k in range(n_tasks):
            if gene in y_df[k].gene_names:
                y.append(
                    (k, y_df[k].get_gene_data(gene,
                                              force_dense=True).reshape(-1,
                                                                        1)))
        return y

    # Scatter common data to workers
    # (hash=False gives the scattered data unique keys instead of content-hashed ones)
    [scatter_x] = DaskController.client.scatter([X],
                                                broadcast=True,
                                                hash=False)
    [scatter_priors] = DaskController.client.scatter([priors],
                                                     broadcast=True,
                                                     hash=False)

    # Wait for scattering to finish before creating futures
    distributed.wait(scatter_x, timeout=DASK_SCATTER_TIMEOUT)
    distributed.wait(scatter_priors, timeout=DASK_SCATTER_TIMEOUT)

    future_list = [
        DaskController.client.submit(regression_maker, i, scatter_x,
                                     response_maker(Y, i), scatter_priors, tfs)
        for i in range(G)
    ]

    # Collect results as they finish instead of waiting for all workers to be done
    result_list = process_futures_into_list(future_list)

    DaskController.client.cancel(scatter_x)
    DaskController.client.cancel(scatter_priors)
    # NOTE(review): restarting the entire dask client after every regression run
    # is unusual — presumably a worker-memory cleanup workaround; confirm it is
    # intentional (the sibling implementation of this function does not restart)
    DaskController.client.restart()

    return result_list
def amusr_regress_dask(X,
                       Y,
                       priors,
                       prior_weight,
                       n_tasks,
                       genes,
                       tfs,
                       G,
                       remove_autoregulation=True,
                       is_restart=False):
    """
    Execute multitask (AMUSR) regression on a dask cluster.

    :param X: Per-task design data (DataFrame-like), indexable by task;
        scattered to all workers
    :param Y: Per-task response data (DataFrame-like), indexable by task
    :param priors: Prior network data; scattered to all workers
    :param prior_weight: Weight applied to prior edges by format_prior
    :param n_tasks: int, number of tasks
    :param genes: Gene labels; gene j is regressed by future j
    :param tfs: Candidate regulator (TF) labels
    :param G: int, number of genes to regress
    :param remove_autoregulation: bool, drop the target gene from its own TF list
    :param is_restart: bool, True when this call is a one-shot retry after an
        unrecoverable worker error

    :return: list
        Returns a list of regression results that the amusr_regression pileup_data can process
    """

    assert MPControl.is_dask()

    from inferelator.regression.amusr_regression import format_prior, run_regression_EBIC
    DaskController = MPControl.client

    # Gets genes, n_tasks, prior_weight, and remove_autoregulation from regress_dask()
    # Other arguments are passed in
    def regression_maker(j, x_df, y_list, prior, tf):
        # Progress message: loud (level 0) every 100 genes, quiet (level 2) otherwise
        level = 0 if j % 100 == 0 else 2
        utils.Debug.allprint(base_regression.PROGRESS_STR.format(gn=genes[j],
                                                                 i=j,
                                                                 total=G),
                             level=level)

        gene = genes[j]
        x, y, tasks = [], [], []

        # Optionally exclude the gene itself from its own predictor TFs
        if remove_autoregulation:
            tf = [t for t in tf if t != gene]
        else:
            pass

        # y_list holds (task index, response vector) pairs built by response_maker
        for k, y_data in y_list:
            x.append(x_df[k].loc[:, tf].values)  # list([N, K])
            y.append(y_data)
            tasks.append(k)  # [T,]

        # Free the response list early to reduce worker memory pressure
        del y_list
        prior = format_prior(prior, gene, tasks, prior_weight)
        return j, run_regression_EBIC(x, y, tf, tasks, gene, prior)

    # Build (task, response) pairs for gene i, one per task that measures it;
    # runs locally so only the needed response vectors travel with each future
    def response_maker(y_df, i):
        y = []
        gene = genes[i]
        for k in range(n_tasks):
            if gene in y_df[k]:
                y.append((k, y_df[k].loc[:, gene].values.reshape(-1, 1)))
        return y

    # Scatter common data to workers
    [scatter_x] = DaskController.client.scatter([X], broadcast=True)
    [scatter_priors] = DaskController.client.scatter([priors], broadcast=True)

    # Wait for scattering to finish before creating futures
    try:
        distributed.wait(scatter_x, timeout=DASK_SCATTER_TIMEOUT)
        distributed.wait(scatter_priors, timeout=DASK_SCATTER_TIMEOUT)
    except distributed.TimeoutError:
        # Proceed anyway; the submits below will block on the scatter futures
        utils.Debug.vprint(
            "Scattering timeout during regression. Dask workers may be sick",
            level=0)

    future_list = [
        DaskController.client.submit(regression_maker, i, scatter_x,
                                     response_maker(Y, i), scatter_priors, tfs)
        for i in range(G)
    ]

    # Collect results as they finish instead of waiting for all workers to be done
    try:
        result_list = process_futures_into_list(future_list)
    except KeyError:
        # A future key was lost (worker died badly); retry the whole job once
        utils.Debug.vprint("Unrecoverable job error; restarting")
        if not is_restart:
            return amusr_regress_dask(
                X,
                Y,
                priors,
                prior_weight,
                n_tasks,
                genes,
                tfs,
                G,
                remove_autoregulation=remove_autoregulation,
                is_restart=True)
        else:
            raise
    # NOTE(review): scatter_x / scatter_priors are not cancelled on the
    # restart or re-raise paths above — possible worker-memory leak; confirm

    # Release the broadcast data from the workers
    DaskController.client.cancel(scatter_x)
    DaskController.client.cancel(scatter_priors)

    return result_list