def test_normalizer(self):
    data = np.ones([3, 5])
    data[0, 2] = 9

    Norm = Normalizer(factorFn=np.sum, activations=[np.exp, np.log])
    Norm.fit(data)
    # print(data)

    data_df = pd.DataFrame(
        data,
        index=['r' + str(ii) for ii in range(data.shape[0])],
        columns=['c' + str(ii) for ii in range(data.shape[1])])

    data_norm = Norm.transform(data_df)
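# A minimal follow-up sketch (not part of the original suite), assuming, as the
# test above implies, that Normalizer.transform() accepts a labelled DataFrame
# and returns an array-like of the same shape. The method name and the assertion
# below are illustrative, not the project's API.
def test_normalizer_shape_sketch(self):
    data = np.random.rand(4, 6) + 1.0
    data_df = pd.DataFrame(
        data,
        index=['r' + str(ii) for ii in range(data.shape[0])],
        columns=['c' + str(ii) for ii in range(data.shape[1])])

    norm = Normalizer(factorFn=np.sum, activations=[np.exp, np.log])
    norm.fit(data_df)

    data_norm = norm.transform(data_df)
    # Normalization should preserve the dimensions of the matrix
    assert np.asarray(data_norm).shape == data_df.shape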
def __init__(self, n_cores=4, predictorLimit=10, preproc='log_or_exp',
             runDir=os.path.join(tempfile.gettempdir(), 'run'),
             seed=0, **NN_params):
    self._maxcores = n_cores
    self.predictorLimit = predictorLimit
    self.norm = Normalizer.fromName(preproc)
    self.runDir = runDir
    self.seed = seed

    self.NN_params = NN_params
    self.NN_params['seed'] = seed
    if 'dims' not in self.NN_params:
        self.NN_params['dims'] = [20, 500]
def predict(self, data, imputed_only=False, policy="restore"):
    print("Starting prediction")

    df = pd.DataFrame(data)
    normalizer = Normalizer.fromName(self.norm)

    """ Create memory chunk and put the matrix in it """
    idx, cols = df.index, df.columns
    df_norm = normalizer.fit(df).transform(df)

    """ Parallelize process with shared array """
    childJobs = [((12, 15), net.__dict__, (idx, cols), "predict")
                 for net in self.networks]

    output_dicts = self._runOnMultipleCores(self.maxcores,
                                            df_norm.values.flatten(),
                                            childJobs)

    # Average the predictions of sub-networks that share target genes
    Y_imputed = pd.concat(output_dicts, axis=1)
    Y_imputed = Y_imputed.groupby(by=Y_imputed.columns, axis=1).mean()

    # Cap predictions at the maximum of the normalized input, then reverse the normalization
    Y_imputed = Y_imputed.mask(Y_imputed > df_norm.values.max(),
                               df_norm[Y_imputed.columns])
    Y_imputed = normalizer.transform(Y_imputed, rev=True)

    # Re-assemble imputed and untouched genes in the original column order
    Y_not_imputed = df.drop(Y_imputed.columns, axis=1)
    Y_total = pd.concat([Y_imputed, Y_not_imputed], axis=1)[df.columns]

    if policy == "restore":
        # Keep every measured (non-zero) value from the input
        Y_total = Y_total.mask(df > 0, df)
    elif policy == "max":
        # Element-wise maximum of input and imputed values
        Y_total = pd.concat([Y_total, df]).max(level=0)
    else:
        # Only fill entries that are still zero after imputation
        Y_total = Y_total.mask(Y_total == 0, df)

    if imputed_only:
        Y_total = Y_total[Y_imputed.columns]

    if isinstance(data, pd.DataFrame):
        return Y_total
    return Y_total.values
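# A self-contained sketch of how the three merge policies in predict() behave on a
# toy matrix, using plain pandas; `_policy_demo` is an illustrative helper and not
# part of the class. The groupby(level=0).max() call is equivalent to the
# pd.concat(...).max(level=0) used above.
def _policy_demo():
    df = pd.DataFrame([[0.0, 2.0], [3.0, 0.0]], columns=["g1", "g2"])  # raw input
    y = pd.DataFrame([[1.0, 1.5], [2.5, 0.0]], columns=["g1", "g2"])   # imputed

    # "restore": every measured (non-zero) input value wins over the imputation
    restore = y.mask(df > 0, df)
    # "max": element-wise maximum of input and imputed values
    maximum = pd.concat([y, df]).groupby(level=0).max()
    # default: only entries still at zero after imputation are taken from the input
    fill_zeros = y.mask(y == 0, df)
    return restore, maximum, fill_zeros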
def __init__(self, n_cores=4, predictorLimit=10, preproc="log_or_exp", runDir=os.path.join(tempfile.gettempdir(), "run"), seed=0, **NN_params): self._maxcores = n_cores self.predictorLimit = predictorLimit self.inOutGenes = None self.norm = Normalizer.fromName(preproc) self.runDir = runDir self.seed = seed NN_params["seed"] = seed if "dims" not in NN_params.keys(): NN_params["dims"] = [20, 500] self.NN_params = NN_params self.trainingParams = None self._minExpressionLevel = NN_params[ 'minExpressionLevel'] if 'minExpressionLevel' in NN_params else 5
def fit(self, data, NN_lim="auto", cell_subset=None, NN_genes=None,
        retrieve_training=False):
    np.random.seed(seed=self.seed)

    targetGeneNames = NN_genes

    inputExpressionMatrixDF = pd.DataFrame(data)
    print("Input dataset is {} genes (columns) and {} cells (rows)".format(
        inputExpressionMatrixDF.shape[1], inputExpressionMatrixDF.shape[0]))
    print("First 3 rows and columns:")
    print(inputExpressionMatrixDF.iloc[0:3, 0:3])

    self._setIDandRundir(inputExpressionMatrixDF)

    # Change the output dimension if the data has too few genes
    if inputExpressionMatrixDF.shape[1] < self.NN_params["dims"][1]:
        self.NN_params["dims"][1] = inputExpressionMatrixDF.shape[1]
    subnetOutputColumns = self.NN_params["dims"][1]

    # Choose genes to impute
    # geneCounts = inputExpressionMatrixDF.sum().sort_values(ascending=False)
    geneQuantiles = inputExpressionMatrixDF.quantile(.99).sort_values(
        ascending=False)

    if targetGeneNames is None:
        targetGeneNames = _get_target_genes(
            geneQuantiles,
            minExpressionLevel=self._minExpressionLevel,
            maxNumOfGenes=NN_lim)

    df_to_impute = inputExpressionMatrixDF[targetGeneNames]

    numberOfTargetGenes = len(targetGeneNames)
    if numberOfTargetGenes == 0:
        raise Exception(
            "Unable to compute any target genes. Is your data log transformed? "
            "Perhaps try with a lower minExpressionLevel.")

    n_runs, n_cores = self._getRunsAndCores(numberOfTargetGenes)

    # ------------------------# Subnetworks #------------------------#
    # Partition the target genes into output layers of `subnetOutputColumns` genes each
    n_choose = int(numberOfTargetGenes / subnetOutputColumns)

    subGenelists = np.random.choice(targetGeneNames,
                                    [n_choose, subnetOutputColumns],
                                    replace=False).tolist()

    if n_choose < n_runs:
        # Special case: for the last run, the output layer will have previous targets
        selectedGenes = np.reshape(subGenelists, -1)
        leftOutGenes = np.setdiff1d(targetGeneNames, selectedGenes)
        fill_genes = np.random.choice(targetGeneNames,
                                      subnetOutputColumns - len(leftOutGenes),
                                      replace=False)
        subGenelists.append(
            np.concatenate([leftOutGenes, fill_genes]).tolist())

    # ------------------------# Extracting input genes #------------------------#
    # Use 1 - |Pearson correlation| as the distance between genes
    corrMatrix = 1 - np.abs(
        pd.DataFrame(np.corrcoef(df_to_impute.T),
                     index=targetGeneNames,
                     columns=targetGeneNames))

    if self.inOutGenes is None:
        self.inOutGenes = get_input_genes(
            df_to_impute,
            self.NN_params["dims"],
            distanceMatrix=corrMatrix,
            targets=subGenelists,
            # predictorDropoutLimit=self.predictorDropoutLimit
        )

    # ------------------------# Subsets for fitting #------------------------#
    n_cells = df_to_impute.shape[0]

    if type(cell_subset) is float or cell_subset == 1:
        n_cells = int(cell_subset * n_cells)
    elif type(cell_subset) is int:
        n_cells = cell_subset

    self.trainCells = df_to_impute.sample(n_cells, replace=False).index

    print(
        "Starting training with {} cells ({:.1%}) on {} threads ({} cores/thread)."
        .format(n_cells,
                1. * n_cells / df_to_impute.shape[0],
                n_cores,
                self.NN_params["n_cores"]))

    if self.trainingParams is None:
        self.trainingParams = [self.NN_params] * len(self.inOutGenes)

    # -------------------# Preprocessing (if any) #--------------------#
    normalizer = Normalizer.fromName(self.norm)
    df_to_impute = normalizer.fit(df_to_impute).transform(df_to_impute)

    # -------------------# Share matrix between subprocesses #--------------------#
    """ Create memory chunk and put the matrix in it """
    idx, cols = self.trainCells, df_to_impute.columns
    trainData = df_to_impute.loc[self.trainCells, :].values

    """ Parallelize process with shared array """
    childJobs = [(in_out, trainingParams, (idx, cols), "train", retrieve_training)
                 for in_out, trainingParams in zip(self.inOutGenes,
                                                   self.trainingParams)]

    self.trainingParams = self._runOnMultipleCores(n_cores,
                                                   trainData.flatten(),
                                                   childJobs)

    # Rebuild one Net per sub-network from the returned training parameters
    self.networks = []
    for dictionary in self.trainingParams:
        self.networks.append(Net(**dictionary))

    print('---- Hyperparameters summary ----')
    self.networks[0].display_params()

    return self
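# A hedged end-to-end usage sketch of the fit/predict cycle defined above. The class
# name `MultiNet` is an assumption (the class statement is not shown in this excerpt),
# and the matrix sizes, Poisson counts and parameter values are illustrative only.
def _usage_sketch():
    np.random.seed(0)
    # Toy dataset: 200 cells (rows) x 50 genes (columns) of sparse, non-negative counts
    raw = pd.DataFrame(np.random.poisson(0.5, size=(200, 50)),
                       columns=["gene{}".format(ii) for ii in range(50)])

    model = MultiNet(n_cores=2, seed=0, minExpressionLevel=1)
    model.fit(raw, cell_subset=1)                    # train on all cells
    imputed = model.predict(raw, policy="restore")   # measured values are kept
    return imputed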