Example #1
    def test_normalizer(self):
        # Build a 3x5 matrix of ones with a single outlier entry.
        data = np.ones([3, 5])
        data[0, 2] = 9
        # Normalize using np.sum as the scaling-factor function and
        # np.exp/np.log as the activation pair.
        norm = Normalizer(factorFn=np.sum, activations=[np.exp, np.log])
        norm.fit(data)
        # Wrap the array in a labeled DataFrame so the transform
        # preserves row and column names.
        data_df = pd.DataFrame(
            data,
            index=['r' + str(ii) for ii in range(data.shape[0])],
            columns=['c' + str(ii) for ii in range(data.shape[1])])
        data_norm = norm.transform(data_df)
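
For reference, a minimal round-trip sketch of the same API in isolation. The import path is an assumption, and rev=True is assumed to invert the forward transform, as Example #3 below relies on:

import numpy as np
import pandas as pd
from deepimpute.normalizer import Normalizer  # assumed module path

data = pd.DataFrame(np.random.rand(4, 6) * 10.0)
norm = Normalizer.fromName('log_or_exp')          # same preset the constructors below use
forward = norm.fit(data).transform(data)
restored = norm.transform(forward, rev=True)      # assumed inverse of the forward transform
print(np.allclose(data.values, restored.values))  # expected: True if rev=True inverts exactly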
Example #2
    def __init__(self,
                 n_cores=4,
                 predictorLimit=10,
                 preproc='log_or_exp',
                 runDir=os.path.join(tempfile.gettempdir(), 'run'),
                 seed=0,
                 **NN_params):
        self._maxcores = n_cores
        self.predictorLimit = predictorLimit
        self.norm = Normalizer.fromName(preproc)
        self.runDir = runDir
        self.seed = seed
        self.NN_params = NN_params
        self.NN_params['seed'] = seed

        # Default network dimensions; dims[1] sets how many genes each
        # sub-network outputs (see Example #5).
        if 'dims' not in self.NN_params:
            self.NN_params['dims'] = [20, 500]
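
A hedged usage sketch for this constructor. The class name MultiNet and the import path are assumptions (the snippet does not show them); the point is that extra keyword arguments flow straight into NN_params:

from deepimpute.multinet import MultiNet  # assumed class name and module path

model = MultiNet(n_cores=8, seed=42, dims=[20, 256])
# model.NN_params == {'seed': 42, 'dims': [20, 256]}; the [20, 500] default
# only applies when dims is not passed.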
Example #3
    def predict(self, data, imputed_only=False, policy="restore"):
        print("Starting prediction")
        df = pd.DataFrame(data)
        normalizer = Normalizer.fromName(self.norm)
        """ Create memory chunk and put the matrix in it """
        idx, cols = df.index, df.columns
        df_norm = normalizer.fit(df).transform(df)
        """ Parallelize process with shared array """
        childJobs = [((12, 15), net.__dict__, (idx, cols), "predict")
                     for net in self.networks]

        output_dicts = self._runOnMultipleCores(self.maxcores,
                                                df_norm.values.flatten(),
                                                childJobs)

        # Each job returns a DataFrame of predictions; genes predicted by
        # several sub-networks are averaged column-wise.
        Y_imputed = pd.concat(output_dicts, axis=1)
        Y_imputed = Y_imputed.groupby(by=Y_imputed.columns, axis=1).mean()

        # Cap predictions at the largest value observed in the normalized input.
        Y_imputed = Y_imputed.mask(Y_imputed > df_norm.values.max(),
                                   df_norm[Y_imputed.columns])

        # Map the predictions back to the original expression scale.
        Y_imputed = normalizer.transform(Y_imputed, rev=True)

        Y_not_imputed = df.drop(Y_imputed.columns, axis=1)

        Y_total = pd.concat([Y_imputed, Y_not_imputed], axis=1)[df.columns]

        if policy == "restore":
            Y_total = Y_total.mask(df > 0, df)
        elif policy == "max":
            Y_total = pd.concat([Y_total, df]).max(level=0)
        else:
            Y_total = Y_total.mask(Y_total == 0, df)

        if imputed_only:
            Y_total = Y_total[Y_imputed.columns]

        # Return the same container type that was passed in.
        if isinstance(data, pd.DataFrame):
            return Y_total
        else:
            return Y_total.values
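
An end-to-end sketch combining this predict() with the fit() from Example #5. The data is synthetic, and the class name and import path are assumptions; the policy values are the three branches handled above:

import numpy as np
import pandas as pd
from deepimpute.multinet import MultiNet  # assumed class name and module path

raw = pd.DataFrame(np.random.poisson(0.5, size=(500, 1000)))
model = MultiNet(seed=0).fit(raw)
restored = model.predict(raw, policy="restore")     # measured non-zeros kept as-is
maxed = model.predict(raw, policy="max")            # element-wise max of input and prediction
new_only = model.predict(raw, imputed_only=True)    # only the imputed gene columns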
Example #4
    def __init__(self,
                 n_cores=4,
                 predictorLimit=10,
                 preproc="log_or_exp",
                 runDir=os.path.join(tempfile.gettempdir(), "run"),
                 seed=0,
                 **NN_params):
        self._maxcores = n_cores
        self.predictorLimit = predictorLimit
        self.inOutGenes = None
        self.norm = Normalizer.fromName(preproc)
        self.runDir = runDir
        self.seed = seed

        NN_params["seed"] = seed
        if "dims" not in NN_params.keys():
            NN_params["dims"] = [20, 500]
        self.NN_params = NN_params
        self.trainingParams = None

        self._minExpressionLevel = NN_params[
            'minExpressionLevel'] if 'minExpressionLevel' in NN_params else 5
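
Relative to Example #2, this variant also tracks inOutGenes and trainingParams and exposes minExpressionLevel. A rough sketch of how that threshold plugs into the gene selection in Example #5 (the >= comparison inside _get_target_genes is an assumption):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.poisson(2, size=(100, 50)))
quantiles = df.quantile(.99)               # per-gene 99th-percentile expression, as in fit()
targets = quantiles[quantiles >= 5].index  # genes clearing the default minExpressionLevel of 5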
Example #5
    def fit(self,
            data,
            NN_lim="auto",
            cell_subset=None,
            NN_genes=None,
            retrieve_training=False):
        np.random.seed(seed=self.seed)
        targetGeneNames = NN_genes

        inputExpressionMatrixDF = pd.DataFrame(data)
        print("Input dataset is {} genes (columns) and {} cells (rows)".format(
            inputExpressionMatrixDF.shape[1],
            inputExpressionMatrixDF.shape[0]))
        print("First 3 rows and columns:")
        print(inputExpressionMatrixDF.iloc[0:3, 0:3])

        self._setIDandRundir(inputExpressionMatrixDF)

        # Change the output dimension if the data has too few genes
        if inputExpressionMatrixDF.shape[1] < self.NN_params["dims"][1]:
            self.NN_params["dims"][1] = inputExpressionMatrixDF.shape[1]

        subnetOutputColumns = self.NN_params["dims"][1]

        # Choose genes to impute
        # geneCounts = inputExpressionMatrixDF.sum().sort_values(ascending=False)
        geneQuantiles = inputExpressionMatrixDF.quantile(.99).sort_values(
            ascending=False)

        if targetGeneNames is None:
            targetGeneNames = _get_target_genes(
                geneQuantiles,
                minExpressionLevel=self._minExpressionLevel,
                maxNumOfGenes=NN_lim)

        df_to_impute = inputExpressionMatrixDF[targetGeneNames]

        numberOfTargetGenes = len(targetGeneNames)
        if numberOfTargetGenes == 0:
            raise Exception(
                "Unable to compute any target genes. Is your data log transformed? Perhaps try with a lower minExpressionLevel."
            )

        n_runs, n_cores = self._getRunsAndCores(numberOfTargetGenes)

        # ------------------------# Subnetworks #------------------------#

        # Partition the target genes across sub-networks, each predicting
        # subnetOutputColumns genes.
        n_choose = numberOfTargetGenes // subnetOutputColumns

        subGenelists = np.random.choice(targetGeneNames,
                                        [n_choose, subnetOutputColumns],
                                        replace=False).tolist()

        if n_choose < n_runs:
            # Special case: the genes left over after the even split form one
            # extra sub-network, padded with already-assigned targets so its
            # output layer still has subnetOutputColumns units.
            selectedGenes = np.reshape(subGenelists, -1)
            leftOutGenes = np.setdiff1d(targetGeneNames, selectedGenes)

            fill_genes = np.random.choice(targetGeneNames,
                                          subnetOutputColumns -
                                          len(leftOutGenes),
                                          replace=False)

            subGenelists.append(
                np.concatenate([leftOutGenes, fill_genes]).tolist())

        # ------------------------# Extracting input genes #------------------------#

        # Gene-gene distance: 1 - |Pearson correlation|.
        corrMatrix = 1 - np.abs(
            pd.DataFrame(np.corrcoef(df_to_impute.T),
                         index=targetGeneNames,
                         columns=targetGeneNames))

        if self.inOutGenes is None:

            self.inOutGenes = get_input_genes(
                df_to_impute,
                self.NN_params["dims"],
                distanceMatrix=corrMatrix,
                targets=subGenelists,
                #predictorDropoutLimit=self.predictorDropoutLimit
            )

        # ------------------------# Subsets for fitting #------------------------#

        n_cells = df_to_impute.shape[0]

        # A float (or exactly 1) is treated as a fraction of cells;
        # an int as an absolute cell count; None keeps all cells.
        if isinstance(cell_subset, float) or cell_subset == 1:
            n_cells = int(cell_subset * n_cells)

        elif isinstance(cell_subset, int):
            n_cells = cell_subset

        self.trainCells = df_to_impute.sample(n_cells, replace=False).index

        print(
            "Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread)."
            .format(
                n_cells,
                1. * n_cells / df_to_impute.shape[0],
                n_cores,
                self.NN_params["n_cores"],
            ))

        if self.trainingParams is None:
            self.trainingParams = [self.NN_params] * len(self.inOutGenes)

        # -------------------# Preprocessing (if any) #--------------------#

        normalizer = Normalizer.fromName(self.norm)

        df_to_impute = normalizer.fit(df_to_impute).transform(df_to_impute)

        # -------------------# Share matrix between subprocesses #--------------------#
        """ Create memory chunk and put the matrix in it """
        idx, cols = self.trainCells, df_to_impute.columns
        trainData = df_to_impute.loc[self.trainCells, :].values
        """ Parallelize process with shared array """
        childJobs = [(in_out, trainingParams, (idx, cols), "train",
                      retrieve_training) for in_out, trainingParams in zip(
                          self.inOutGenes, self.trainingParams)]

        self.trainingParams = self._runOnMultipleCores(n_cores,
                                                       trainData.flatten(),
                                                       childJobs)

        self.networks = []
        for dictionary in self.trainingParams:
            self.networks.append(Net(**dictionary))

        print('---- Hyperparameters summary ----')
        self.networks[0].display_params()

        return self
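
To make the sub-network partition arithmetic concrete, a standalone walk-through of the quantities computed above (the gene count is arbitrary):

numberOfTargetGenes, subnetOutputColumns = 1230, 500

n_choose = numberOfTargetGenes // subnetOutputColumns             # 2 full sub-networks of 500 genes
leftover = numberOfTargetGenes - n_choose * subnetOutputColumns   # 230 genes not yet assigned
fill = subnetOutputColumns - leftover                             # 270 already-assigned genes reused
# If n_choose < n_runs, fit() appends one extra sub-network covering the
# 230 leftover genes plus 270 refills, so every sub-network outputs 500 genes.
print(n_choose, leftover, fill)  # 2 230 270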