Example #1
    def validate_data(self, check_prior=True, check_gold_standard=True):
        """
        Make sure that the data that's loaded is acceptable
        """

        if check_prior:
            # Create a null prior if the flag is set
            if self.use_no_prior and self.priors_data is not None:
                warnings.warn("The use_no_prior flag will be ignored because prior data exists")
            elif self.use_no_prior:
                Debug.vprint("A null prior is has been created", level=0)
                self.priors_data = self._create_null_prior(self._gene_names, self.tf_names)

        if check_gold_standard:
            # Create a null gold standard if the flag is set
            if self.use_no_gold_standard and self.gold_standard is not None:
                warnings.warn("The use_no_gold_standard flag will be ignored because gold standard data exists")
            elif self.use_no_gold_standard:
                Debug.vprint("A null gold standard has been created", level=0)
                self.gold_standard = self._create_null_prior(self._gene_names, self.tf_names)
            elif self.gold_standard is None:
                _msg = "No gold standard found. Model scoring will be invalid. "
                _msg += "Set worker.set_network_data_flags(use_no_gold_standard=True) to explicitly continue."
                raise ValueError(_msg)

        if check_prior and check_gold_standard:
            # Validate that some network information exists and has been loaded
            if self.priors_data is None and self.gold_standard is None:
                raise ValueError("No gold standard or priors have been provided")
Example #2
    def validate_data(self):
        """
        Make sure that the data that's loaded is acceptable
        """

        # Create a null prior if the flag is set
        if self.use_no_prior and self.priors_data is not None:
            warnings.warn(
                "The use_no_prior flag will be ignored because prior data exists"
            )
        elif self.use_no_prior:
            Debug.vprint("A null prior is has been created", level=0)
            self.priors_data = self._create_null_prior(self.data.gene_names,
                                                       self.tf_names)

        # Create a null gold standard if the flag is set
        if self.use_no_gold_standard and self.gold_standard is not None:
            warnings.warn(
                "The use_no_gold_standard flag will be ignored because gold standard data exists"
            )
        elif self.use_no_gold_standard:
            Debug.vprint("A null gold standard has been created", level=0)
            self.gold_standard = self._create_null_prior(
                self.data.gene_names, self.tf_names)

        # Validate that some network information exists and has been loaded
        if self.priors_data is None and self.gold_standard is None:
            raise ValueError("No gold standard or priors have been provided")
Example #3
    def load_activity(self, file=None, file_type=None):

        file = self._tfa_input_file if file is None else file
        file_type = self._tfa_input_file_type if file_type is None else file_type

        loader = InferelatorDataLoader(
            input_dir=self.input_dir,
            file_format_settings=self._file_format_settings)

        # Dispatch on the resolved file type (the original checked
        # self._expression_loader here, which looks like a copy error)
        if file_type.lower() == "h5ad":
            self.design = loader.load_data_h5ad(file)
        elif file_type.lower() == "tsv":
            self.design = loader.load_data_tsv(file)
        else:
            raise ValueError(
                "Invalid TFA input file type: {t}".format(t=file_type))

        Debug.vprint("Loaded {f} as design matrix {d}".format(
            d=self.design.shape, f=file),
                     level=1)

        self.design.trim_genes(
            remove_constant_genes=False,
            trim_gene_list=self.design.gene_names.intersection(self.tf_names))

        Debug.vprint("Trimmed to {d} for TF activity".format(
            d=self.design.shape, f=file),
                     level=1)

        assert check.indexes_align(
            [self.design.sample_names, self.response.sample_names])
Example #4
    def load_velocity(self, velocity_file=None, loader_type=None):

        velocity_file = self._velocity_file_name if velocity_file is None else velocity_file
        loader_type = self._velocity_file_type if loader_type is None else loader_type
        transpose = not self.expression_matrix_columns_are_genes

        loader = InferelatorDataLoader(
            input_dir=self.input_dir,
            file_format_settings=self._file_format_settings)
        Debug.vprint("Loading velocity data from {f}".format(f=velocity_file),
                     level=1)

        if loader_type == _TSV or loader_type is None:
            self._velocity_data = loader.load_data_tsv(
                velocity_file, transpose_expression_data=transpose)

        elif loader_type == _H5AD:
            self._velocity_data = loader.load_data_h5ad(
                velocity_file, use_layer=self._velocity_h5_layer)

        elif loader_type == _HDF5:
            self._velocity_data = loader.load_data_hdf5(
                velocity_file,
                transpose_expression_data=transpose,
                use_layer=self._velocity_h5_layer)
        else:
            raise ValueError(
                "Invalid velocity_file_type: {a}".format(a=loader_type))

        self._velocity_data.name = "Velocity"
Example #5
    def read_priors(self, priors_file=None, gold_standard_file=None):
        """
        Read in the priors and gold standard files
        """

        priors_file = priors_file if priors_file is not None else self.priors_file
        gold_standard_file = gold_standard_file if gold_standard_file is not None else self.gold_standard_file

        loader = InferelatorDataLoader(input_dir=self.input_dir, file_format_settings=self._file_format_settings)

        if priors_file is not None:

            Debug.vprint("Loading prior data from file {file}".format(file=priors_file), level=1)
            self.priors_data = loader.input_dataframe(priors_file)

            # Print debug info & check prior for duplicate indices (which will raise errors later)
            self.loaded_file_info("Priors data", self.priors_data)
            self._check_network_labels_unique("Priors_data", priors_file, self.priors_data)

        if gold_standard_file is not None:

            Debug.vprint("Loading gold_standard data from file {file}".format(file=gold_standard_file), level=1)
            self.gold_standard = loader.input_dataframe(gold_standard_file)

            # Print debug info & check gold standard for duplicate indices (which will raise errors later)
            self.loaded_file_info("Gold standard", self.gold_standard)
            self._check_network_labels_unique("Gold standard", gold_standard_file, self.gold_standard)
Example #6
def make_data_noisy(data, random_seed=42):
    """
    Generate a new data object of random data which matches the provided data

    :param data: Raw read data
    :type data: InferelatorData
    :param random_seed: Random seed for data generation
    :type random_seed: int
    :return: Simulated data
    :rtype: InferelatorData
    """

    # Calculate probability vector for gene expression
    # Discrete sampling for count data

    sample_counts = data.sample_counts

    if data._is_integer:

        Debug.vprint("Simulating integer count data for {n} samples".format(
            n=data.num_obs),
                     level=0)

        # Data is centered already
        if np.any(sample_counts <= 0.):
            p_vec = np.ones(data.num_genes, dtype=float)

        # Normalize to mean counts per sample and sum counts per gene by matrix multiplication
        else:
            p_vec = (np.mean(sample_counts) / sample_counts).reshape(
                1, -1) @ data.expression_data

        # Flatten and convert counts to a probability vector
        p_vec = p_vec.flatten()
        p_vec = p_vec / p_vec.sum()

        data.expression_data = _sim_ints(p_vec,
                                         sample_counts,
                                         sparse=data.is_sparse,
                                         random_seed=random_seed)

    else:

        # Data is centered already
        if np.any(sample_counts <= 0.):
            p_vec = np.zeros(data.num_genes, dtype=float)

        # Normalize to mean total measured values per sample and sum counts per gene by matrix multiplication
        else:
            p_vec = (np.mean(sample_counts) / sample_counts).reshape(
                1, -1) @ data.expression_data
            p_vec /= data.num_obs

        Debug.vprint(
            "Simulating float data for {n} samples".format(n=data.num_obs),
            level=0)
        data.expression_data = _sim_float(p_vec.flatten(),
                                          data.gene_stdev,
                                          data.num_obs,
                                          random_seed=random_seed)
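`_sim_ints` is not shown here. As a hedged sketch of what it could do with these arguments, one straightforward implementation draws each sample's counts from a multinomial over the gene probability vector; this is an assumption about the helper, not the library's actual code:

import numpy as np

def _sim_ints_sketch(p_vec, sample_counts, random_seed=42):
    # Assumed behavior: each simulated sample keeps its original total count,
    # with genes drawn according to the probability vector p_vec
    rng = np.random.default_rng(random_seed)
    return np.vstack([rng.multinomial(int(n), p_vec) for n in sample_counts])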
Example #7
    def __init__(self, X, Y):
        """
        Create a regression object and do basic data transforms

        :param X: Expression or Activity data [N x K]
        :type X: InferelatorData
        :param Y: Response expression data [N x G]
        :type Y: InferelatorData
        """

        # Get the IDs and total count for the genes and predictors
        self.K = X.num_genes
        self.tfs = X.gene_names
        self.G = Y.num_genes
        self.genes = Y.gene_names

        # Rescale the design expression or activity data on features
        self.X = X
        self.X.zscore()

        self.Y = Y

        Debug.vprint(
            "Predictor matrix {pr} and response matrix {re} ready".format(
                pr=X.shape, re=Y.shape))
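`X.zscore()` standardizes the design matrix on features. A plain-numpy sketch of the standard per-column z-score (not necessarily the library's exact implementation):

import numpy as np

def zscore_columns(arr):
    # Center each feature (column) to mean 0 and scale to unit variance;
    # assumes no constant columns (std would be zero there)
    return (arr - arr.mean(axis=0)) / arr.std(axis=0)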
Example #8
    def run_bootstrap(self, bootstrap_idx):
        betas, betas_resc = [], []

        # Select the appropriate bootstrap from each task and stash the data into X and Y
        for k in range(self._n_tasks):
            X = self._task_design[k].get_bootstrap(self._task_bootstraps[k][bootstrap_idx])
            Y = self._task_response[k].get_bootstrap(self._task_bootstraps[k][bootstrap_idx])

            # Make sure that the priors align to the expression matrix
            priors_data = (self._task_priors[k]
                           .reindex(labels=self._targets, axis=0)
                           .reindex(labels=self._regulators, axis=1)
                           .fillna(value=0))

            if self.clr_only:
                # Create a mock prior with no information if clr_only is set
                priors_data = pd.DataFrame(0, index=priors_data.index, columns=priors_data.columns)

            MPControl.sync_processes(pref="bbsr_pre")

            Debug.vprint('Calculating MI, Background MI, and CLR Matrix', level=0)
            clr_matrix, _ = self.mi_driver().run(Y, X, return_mi=False)

            Debug.vprint('Calculating task {k} betas using BBSR'.format(k=k), level=0)
            t_beta, t_br = BBSR(X, Y, clr_matrix, priors_data,
                                prior_weight=self.prior_weight, no_prior_weight=self.no_prior_weight,
                                nS=self.bsr_feature_num).run()
            betas.append(t_beta)
            betas_resc.append(t_br)

        return betas, betas_resc
Example #9
    def compute_common_data(self):
        """
        Compute common data structures like design and response matrices.
        """

        drd = self.drd_driver(
            metadata_handler=self.metadata_handler,
            return_half_tau=True) if self.drd_driver is not None else None

        # If there is no design-response driver set, use the expression data for design and response
        # Also do this if there is no usable metadata
        if drd is None or not drd.validate_run(self.data.meta_data):
            self.design, self.response, self.half_tau_response = self.data, self.data, self.data

        # Otherwise calculate the design-response ODE
        # TODO: Rewrite DRD for InferelatorData
        # TODO: This is *horrifying* as is from a memory perspective
        # TODO: Really fix this soon
        else:
            Debug.vprint('Creating design and response matrix ... ')
            drd.delTmin, drd.delTmax, drd.tau = self.delTmin, self.delTmax, self.tau

            design, response, half_tau_response = drd.run(
                self.data.to_df().T, self.data.meta_data)
            self.design = InferelatorData(design.T)
            self.response = InferelatorData(response.T)
            self.half_tau_response = InferelatorData(half_tau_response.T)

        Debug.vprint("Constructed design {d} and response {r} matrices".format(
            d=self.design.shape, r=self.response.shape),
                     level=1)

        self.data = None
Example #10
    def _check_file_exists(self, file_name):
        """
        Print a warning if a file doesn't exist
        :param file_name: str
        """

        if file_name is not None and not os.path.isfile(self.input_path(file_name)):
            Debug.vprint("File {f} does not exist".format(f=file_name), level=0)
Example #11
    def filter_to_gene_list(self):
        """
        Filter the priors and expression matrix to just genes in gene_metadata
        """

        Debug.vprint("Trimming expression matrix", level=1)
        self.data.trim_genes(trim_gene_list=self.gene_names)
        self.priors_data = self.prior_manager.filter_priors_to_genes(self.priors_data, self.data.gene_names)
Example #12
    def trim_genes(self, remove_constant_genes=True, trim_gene_list=None):
        """
        Remove genes (columns) that are unwanted from the data set. Do this in-place.

        :param remove_constant_genes:
        :type remove_constant_genes: bool
        :param trim_gene_list: This is a list of genes to KEEP.
        :type trim_gene_list: list, pd.Series, pd.Index
        """

        keep_column_bool = np.ones((len(self._adata.var_names),), dtype=bool)

        if trim_gene_list is not None:
            keep_column_bool &= self._adata.var_names.isin(trim_gene_list)
        if "trim_gene_list" in self._adata.uns:
            keep_column_bool &= self._adata.var_names.isin(self._adata.uns["trim_gene_list"])

        list_trim = len(self._adata.var_names) - np.sum(keep_column_bool)
        comp = 0 if self._is_integer else np.finfo(self.values.dtype).eps * 10

        if remove_constant_genes:
            nz_var = self.values.max(axis=0) - self.values.min(axis=0)
            nz_var = nz_var.A.flatten() if self.is_sparse else nz_var

            if np.any(np.isnan(nz_var)):
                raise ValueError("NaN values are present in the expression matrix; unable to remove var=0 genes")

            nz_var = comp < nz_var

            keep_column_bool &= nz_var
            # Count genes removed for having (numerically) zero variance
            var_zero_trim = np.sum(~nz_var)
        else:
            var_zero_trim = 0

        if np.sum(keep_column_bool) == 0:
            err_msg = "No genes remain after trimming. ({lst} removed to match list, {v} removed for var=0)"
            raise ValueError(err_msg.format(lst=list_trim, v=var_zero_trim))

        if np.sum(keep_column_bool) != self._adata.shape[1]:
            Debug.vprint("Trimming {name} matrix {sh} to {n} columns".format(
                name=self.name, sh=self._adata.X.shape,
                n=np.sum(keep_column_bool)),
                         level=1)

            # This explicit copy allows the original to be deallocated
            # Otherwise the GC leaves the original because the view reference keeps it alive
            # At some point it will need to copy so why not now
            self._adata = AnnData(self._adata.X[:, keep_column_bool],
                                  obs=self._adata.obs.copy(),
                                  var=self._adata.var.loc[keep_column_bool, :].copy(),
                                  dtype=self._adata.X.dtype)

            # Make sure that there's no hanging reference to the original object
            gc.collect()
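A usage sketch for `trim_genes`, assuming `data` is a loaded InferelatorData object; the gene identifiers here are hypothetical:

# Keep only the listed genes and drop anything with zero variance
data.trim_genes(
    remove_constant_genes=True,
    trim_gene_list=["GENE_A", "GENE_B", "GENE_C"]  # hypothetical gene IDs
)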
Example #13
    def _recalculate_design(self):
        """
        Use the TFA driver to recalculate the design matrix
        """
        self.design.convert_to_float()
        self.half_tau_response.convert_to_float()
        self.design = self.tfa_driver().compute_transcription_factor_activity(
            self.priors_data, self.design, self.half_tau_response)
        Debug.vprint("Rebuilt design matrix {d} with TF activity".format(
            d=self.design.shape),
                     level=1)
Example #14
    def set_expression_file(self, tsv=None, hdf5=None, h5ad=None, tenx_path=None, mtx=None, mtx_barcode=None,
                            mtx_feature=None, h5_layer=None):
        """
        Set the type of expression data file. Current loaders include TSV, hdf5, h5ad (AnnData), and MTX sparse files.
        Only one of these loaders can be used; passing arguments for multiple loaders will raise a ValueError.

        :param tsv: A path to a TSV (or tsv.gz) file which can be loaded by pandas.read_csv()
        :type tsv: str, optional
        :param hdf5: A path to a hdf5 file which can be loaded by pandas.HDFStore
        :type hdf5: str, optional
        :param h5ad: A path to an AnnData hd5 file
        :type h5ad: str, optional
        :param tenx_path: A path to the folder containing the 10x mtx, barcode, and feature files
        :type tenx_path: Path, optional
        :param mtx: A path to an mtx file
        :type mtx: str, optional
        :param mtx_barcode: A path to a list of observation names (i.e. barcodes, etc) for the mtx file
        :type mtx_barcode: str, optional
        :param mtx_feature: A path to a list of gene names for the mtx file
        :type mtx_feature: str, optional
        :param h5_layer: The layer (in an AnnData h5) or the store key (in an hdf5) file to use.
            Defaults to using the first key.
        :type h5_layer: str, optional
        """

        nones = [tsv is None, hdf5 is None, h5ad is None, tenx_path is None, mtx is None]

        if all(nones):
            Debug.vprint("No file provided", level=0)
        elif sum(nones) != (len(nones) - 1):
            raise ValueError("Only one type of input expression file can be set")

        if tsv is not None:
            self._set_file_name("expression_matrix_file", tsv)
            self._expression_loader = _TSV
        elif hdf5 is not None:
            self._set_file_name("expression_matrix_file", hdf5)
            self._expression_loader = _HDF5
            self._h5_layer = h5_layer
        elif h5ad is not None:
            self._set_file_name("expression_matrix_file", h5ad)
            self._expression_loader = _H5AD
            self._h5_layer = h5_layer
        elif mtx is not None:
            self._check_file_exists(mtx)
            self._check_file_exists(mtx_barcode)
            self._check_file_exists(mtx_feature)
            self.expression_matrix_file = (mtx, mtx_barcode, mtx_feature)
            self._expression_loader = _MTX
        elif tenx_path is not None:
            self.expression_matrix_file = tenx_path
            self._expression_loader = _TENX
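A usage sketch, assuming a workflow object with this method (the name `worker` and the file names are hypothetical); per the docstring, only one loader may be selected per call:

# Load expression from an AnnData h5ad file, reading the "counts" layer
worker.set_expression_file(h5ad="expression.h5ad", h5_layer="counts")

# Passing two loaders at once would raise a ValueError:
# worker.set_expression_file(tsv="expression.tsv", h5ad="expression.h5ad")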
Example #15
    def filter_to_gene_list(self):
        """
        Filter the priors and expression matrix to just genes in gene_metadata
        """

        # Most operations will be column-wise; change sparse type if needed here
        Debug.vprint("Preparing to trim expression matrix", level=2)
        self.data.to_csc()

        Debug.vprint("Trimming expression matrix", level=1)
        self.data.trim_genes(trim_gene_list=self.gene_names)
        self.priors_data = self.prior_manager.filter_priors_to_genes(
            self.priors_data, self.data.gene_names)
Example #16
    def mi_make(i):
        level = 2 if i % 1000 == 0 else 3
        Debug.allprint("Mutual Information Calculation [{i} / {total}]".format(
            i=i, total=m1),
                       level=level)

        discrete_X = _make_discrete(
            X[:, i].A.flatten() if sps.isspmatrix(X) else X[:, i].flatten(),
            bins)
        return [
            _calc_mi(_make_table(discrete_X, Y[:, j], bins), logtype=logtype)
            for j in range(m2)
        ]
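`_make_table` and `_calc_mi` are not shown. As a self-contained sketch of the computation they presumably perform together (grounded only in the standard mutual information definition, not the library's code):

import numpy as np

def mi_from_discrete(x, y, bins, logtype=np.log):
    # Joint contingency table of two discretized integer vectors
    table = np.zeros((bins, bins))
    np.add.at(table, (x, y), 1)

    # MI = sum over cells of p(x,y) * log(p(x,y) / (p(x) * p(y)))
    pxy = table / table.sum()
    px = pxy.sum(axis=1, keepdims=True)
    py = pxy.sum(axis=0, keepdims=True)
    nz = pxy > 0
    return np.sum(pxy[nz] * logtype(pxy[nz] / (px @ py)[nz]))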
Example #17
    def _get_file_name_from_attribute(self, file_name):
        """
        Check and see if a file name is an object attribute that holds a file name
        :param file_name: str
        :return file_name: str
        """
        # Check and see if file_name is actually an object attribute holding a file name. Use that if so.
        if file_name not in self._file_format_settings:
            if hasattr(self, file_name) and getattr(self, file_name) in self._file_format_settings:
                file_name = getattr(self, file_name)
            else:
                Debug.vprint("File {f} is unknown".format(f=file_name), level=0)
                return None
        return file_name
Example #18
    def run_regression(self):

        betas = [[] for _ in range(self._n_tasks)]
        rescaled_betas = [[] for _ in range(self._n_tasks)]

        for idx in range(self.num_bootstraps):
            Debug.vprint('Bootstrap {} of {}'.format((idx + 1),
                                                     self.num_bootstraps),
                         level=0)
            current_betas, current_rescaled_betas = self.run_bootstrap(idx)

            for k in range(self._n_tasks):
                betas[k].append(current_betas[k])
                rescaled_betas[k].append(current_rescaled_betas[k])

        return betas, rescaled_betas
Example #19
        def get_data(self):
            """
            Load all the data and then return a list of references to TaskData objects
            There will be multiple objects returned if tasks_from_metadata is set.
            If tasks_from_metadata is not set, the list contains only this task (self)

            :return: List of TaskData objects with loaded data
            :rtype: list(TaskData)
            """
            Debug.vprint("Loading data for task {task_name}".format(task_name=self.task_name))
            super(TaskData, self).get_data()

            if self.tasks_from_metadata:
                return self.separate_tasks_by_metadata()
            else:
                return [self]
Example #20
    def run_regression(self):
        betas = []
        rescaled_betas = []

        for idx, bootstrap in enumerate(self.get_bootstraps()):
            Debug.vprint('Bootstrap {} of {}'.format((idx + 1),
                                                     self.num_bootstraps),
                         level=0)
            np.random.seed(self.random_seed + idx)
            current_betas, current_rescaled_betas = self.run_bootstrap(
                bootstrap)

            betas.append(current_betas)
            rescaled_betas.append(current_rescaled_betas)

        return betas, rescaled_betas
Example #21
    def print_file_loading_arguments(self, file_name):
        """
        Print the settings that will be used to load a given file name.

        :param file_name: The name of the variable containing the file name (from `set_file_properties`)
        :type file_name: str
        """

        # Check and see if file_name is actually an object attribute holding a file name. Use that if so.
        file_name = self._get_file_name_from_attribute(file_name)
        if file_name is None:
            return

        msg = "File {f} has the following settings:".format(f=file_name)
        msg += "\n\t".join([str(k) + " = " + str(v) for k, v in self._file_format_settings[file_name].items()])
        Debug.vprint(msg, level=0)
Example #22
    def _check_network_labels_unique(df_name, file_name, df, raise_on_duplicate=False):

        _msg = None

        if not df.columns.is_unique:
            _repeated = df.columns[df.columns.duplicated()]
            _msg = "{name} {f}: {n} TFs are duplicated ({g})"
            Debug.vprint(_msg.format(name=df_name, f=file_name, n=len(_repeated), g=" ".join(_repeated)), level=0)

        if not df.index.is_unique:
            _repeated = df.index[df.index.duplicated()]
            _msg = "{name} {f}: {n} Genes are duplicated ({g})"
            Debug.vprint(_msg.format(name=df_name, f=file_name, n=len(_repeated), g=" ".join(_repeated)), level=0)

        if _msg is not None and raise_on_duplicate:
            raise ValueError(_msg)
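The duplicate checks rest on the pandas Index API; a minimal standalone sketch with hypothetical labels:

import pandas as pd

idx = pd.Index(["GENE_A", "GENE_B", "GENE_A"])
print(idx.is_unique)          # False
print(idx[idx.duplicated()])  # Index(['GENE_A'], dtype='object')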
Example #23
        def separate_tasks_by_metadata(self, meta_data_column=None):
            """
            Take a single expression matrix and break it into multiple dataframes based on meta_data. Return a list of
            TaskData objects which have the task-specific data loaded into them

            :param meta_data_column: Meta_data column which corresponds to task ID
            :type meta_data_column: str
            :return new_task_objects: List of the TaskData objects with only one task's data each
            :rtype: list(TaskData)

            """

            if self.data is None:
                raise ValueError("No data has been loaded prior to `separate_tasks_by_metadata`")

            meta_data_column = meta_data_column if meta_data_column is not None else self.meta_data_task_column
            if meta_data_column is None:
                raise ValueError("tasks_from_metadata is set but meta_data_task_column is not")
            elif meta_data_column not in self.data.meta_data:
                msg = "meta_data_task_column is not found in task {t}".format(t=str(self))
                raise ValueError(msg)

            new_task_objects = list()
            tasks = self.data.meta_data[meta_data_column].unique().tolist()
            Debug.vprint("Creating {n} tasks from metadata column {col}".format(n=len(tasks), col=meta_data_column),
                         level=0)

            # Remove data references from self
            data = self.data
            self.data = None

            for task in tasks:
                # Copy this object
                task_obj = copy.deepcopy(self)

                # Get an index of the stuff to keep
                task_idx = data.meta_data[meta_data_column] == task

                # Reset expression matrix, metadata, and task_name in the copy
                task_obj.data = data.subset_copy(row_index=task_idx)
                task_obj.data.name = task
                task_obj.task_name = task
                new_task_objects.append(task_obj)

            Debug.vprint("Separated data into {ntask} tasks".format(ntask=len(new_task_objects)), level=0)

            return new_task_objects
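A usage sketch, assuming `task` is a TaskData object with data already loaded; the metadata column name is hypothetical:

# Split one loaded task into one TaskData object per condition
tasks = task.separate_tasks_by_metadata(meta_data_column="condition")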
Example #24
    def read_tfs(self, file=None):
        """
        Read tf names file into tf_names
        """

        # Load the class variable if no file is passed
        file = self.tf_names_file if file is None else file

        if file is not None:
            Debug.vprint("Loading TF feature names from file {file}".format(file=file), level=1)
            # Read in a dataframe with no header or index
            loader = InferelatorDataLoader(input_dir=self.input_dir, file_format_settings=self._file_format_settings)
            tfs = loader.input_dataframe(file, header=None, index_col=None)

            # Cast the dataframe into a list
            assert tfs.shape[1] == 1
            self.tf_names = tfs.values.flatten().tolist()
Example #25
    def _align_velocity(self):

        keep_genes = self._velocity_data.gene_names.intersection(
            self.data.gene_names)
        Debug.vprint(
            "Aligning velocity and expression data on {n} genes".format(
                n=len(keep_genes)))

        self._velocity_data.trim_genes(remove_constant_genes=False,
                                       trim_gene_list=keep_genes)
        self.data.trim_genes(remove_constant_genes=False,
                             trim_gene_list=keep_genes)

        assert check.indexes_align(
            (self._velocity_data.gene_names, self.data.gene_names))
        assert check.indexes_align(
            (self._velocity_data.sample_names, self.data.sample_names))
Example #26
    def pileup_data(self, run_data):
        """
        Take the completed run data and pack it up into a DataFrame of betas

        :param run_data: list
            A list of regression result dicts ordered by gene. Each regression result should have `ind`, `pp`, `betas`
            and `betas_resc` keys with the appropriate data.
        :return betas, betas_rescale: (pd.DataFrame [G x K], pd.DataFrame [G x K])
        """

        # Create G x K arrays of 0s to populate with the regression data
        betas = np.zeros((self.G, self.K), dtype=np.dtype(float))
        betas_rescale = np.zeros((self.G, self.K), dtype=np.dtype(float))

        # Populate the zero arrays with the BBSR betas
        for data in run_data:

            # The regression should produce a result for every gene;
            # a missing result indicates an upstream failure
            if data is None:
                raise RuntimeError("No model produced by regression method")

            xidx = data['ind']  # Int
            yidx = data['pp']  # Boolean array of size K
            betas[xidx, yidx] = data['betas']
            betas_rescale[xidx, yidx] = data['betas_resc']

        d_len, b_avg, null_m = self._summary_stats(betas)
        Debug.vprint("Regression complete:", end=" ", level=0)
        Debug.vprint(
            "{d_len} Models, {b_avg} Preds per Model ({nom} Null)".format(
                d_len=d_len, b_avg=round(b_avg, 4), nom=null_m),
            level=0)

        # Convert arrays into pd.DataFrames to return results
        betas = pd.DataFrame(betas,
                             index=self.Y.gene_names,
                             columns=self.X.gene_names)
        betas_rescale = pd.DataFrame(betas_rescale,
                                     index=self.Y.gene_names,
                                     columns=self.X.gene_names)

        return betas, betas_rescale
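The fill step relies on numpy mixed integer/boolean indexing: `betas[xidx, yidx] = values` writes into row `xidx` at the columns where the mask `yidx` is True. A minimal sketch with hypothetical sizes:

import numpy as np

betas = np.zeros((3, 5))                         # G=3 genes, K=5 predictors
pp = np.array([True, False, True, False, True])  # predictors kept for gene 1
betas[1, pp] = [0.5, -0.2, 0.9]                  # one value per True position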
Example #27
    def run_regression(self):
        betas = []
        rescaled_betas = []

        MPControl.sync_processes("pre_regression")

        for idx, bootstrap in enumerate(self.get_bootstraps()):
            Debug.vprint('Bootstrap {} of {}'.format((idx + 1),
                                                     self.num_bootstraps),
                         level=0)
            np.random.seed(self.random_seed + idx)
            current_betas, current_rescaled_betas = self.run_bootstrap(
                bootstrap)
            if self.is_master():
                betas.append(current_betas)
                rescaled_betas.append(current_rescaled_betas)

            MPControl.sync_processes("post_bootstrap")

        return betas, rescaled_betas
Example #28
    def read_genes(self, file=None):
        """
        Read gene names file into gene_names
        """

        # Load the class variable if no file is passed
        file = self.gene_names_file if file is None else file

        if file is not None:
            Debug.vprint("Loading Gene feature names from file {file}".format(file=file), level=1)
            # Read in a dataframe with no header or index
            loader = InferelatorDataLoader(input_dir=self.input_dir, file_format_settings=self._file_format_settings)
            genes = loader.input_dataframe(file, header=None, index_col=None)

            # Cast the dataframe into a list
            assert genes.shape[1] == 1
            self.gene_names = genes.values.flatten().tolist()

        # Use the gene names in the data file if no restrictive list is provided
        if self.gene_names is None and self.data is not None:
            self.gene_names = self.data.gene_names.copy()
Example #29
    def set_mkl(cls, mkl=True):

        # If the MKL flag is None, don't change anything
        if mkl is None:
            return

        # If the MKL flag is True, use the dot_product_mkl function when .dot() is called
        if mkl:
            try:
                from sparse_dot_mkl import get_version_string, dot_product_mkl as dp
                msg = "Matrix multiplication will use sparse_dot_mkl package with MKL: {m}"
                vstring = get_version_string()
                Debug.vprint(msg.format(m=vstring if vstring is not None else "Install mkl-service for details"),
                             level=2)

                cls._dot_func = dp

            # If it isn't available, use the scipy/numpy functions instead
            except ImportError as err:
                Debug.vprint("Unable to load MKL with sparse_dot_mkl:\n" + str(err), level=0)
                cls._dot_func = dot_product

        # If the MKL flag is False, use the python (numpy/scipy) functions when .dot() is called
        else:
            Debug.vprint("Matrix multiplication will use Numpy; this is not advised for sparse data", level=2)
            cls._dot_func = dot_product
Example #30
    def _combine_expression_velocity(self, expression, velocity):
        """
        Calculate dX/dt + lambda * X

        :param expression: Expression data X [N x G]
        :param velocity: Velocity data dX/dt [N x G]
        :return: Combined InferelatorData, or the velocity data unchanged
            if no decay information is available
        """

        assert check.indexes_align(
            (expression.gene_names, velocity.gene_names))
        assert check.indexes_align(
            (expression.sample_names, velocity.sample_names))

        if self._decay_constants is not None:
            Debug.vprint("Using preloaded decay constants in _decay_constants")
            decay_constants = self._decay_constants
        elif self.tau is not None:
            Debug.vprint(
                "Calculating decay constants for tau {t}".format(t=self.tau))
            decay_constants = np.repeat(1 / self.tau, expression.num_genes)
        elif "decay_constants" in velocity.gene_data.columns and self._use_precalculated_decay_constants:
            Debug.vprint(
                "Extracting decay constants from {n}".format(n=velocity.name))
            decay_constants = velocity.gene_data["decay_constants"].values
        elif "decay_constants" in expression.gene_data.columns and self._use_precalculated_decay_constants:
            Debug.vprint("Extracting decay constants from {n}".format(
                n=expression.name))
            decay_constants = expression.gene_data["decay_constants"].values
        else:
            Debug.vprint(
                "No decay information found. Solving dX/dt = AB for Betas")
            return velocity

        x = np.multiply(expression.values, decay_constants[None, :])
        return InferelatorData(np.add(velocity.values, x),
                               gene_names=expression.gene_names,
                               sample_names=expression.sample_names,
                               meta_data=expression.meta_data)
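When decay constants come from `tau`, the combination reduces to dX/dt + X / tau. A minimal numpy sketch of the same arithmetic (array values hypothetical):

import numpy as np

tau = 2.0
expression = np.array([[1.0, 4.0], [2.0, 8.0]])  # X, [N x G]
velocity = np.array([[0.1, -0.5], [0.2, 0.0]])   # dX/dt, [N x G]

# lambda = 1/tau for every gene, as in the tau branch above
decay_constants = np.repeat(1 / tau, expression.shape[1])
combined = velocity + expression * decay_constants[None, :]
# combined[0, 0] == 0.1 + 1.0 * 0.5 == 0.6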