import numpy as np
import pandas as pd


def __fix_bad_data(self, data: GeoData):
    # limit the data to the most important replicates because we don't
    # have enough memory to run the R methods on the full data
    max_replicates = 6
    if len(data.control_array) < len(data.genes):
        # some experiments are stored transposed and need to be flipped
        control = np.array(data.control_array).T.tolist()
        perturbed = np.array(data.perturbed_array).T.tolist()
    else:
        control = data.control_array
        perturbed = data.perturbed_array
    # for control, pick the first replicates
    control = [x[:max_replicates] for x in control]
    # for perturbed, pick the last replicates
    perturbed = [x[-max_replicates:] for x in perturbed]
    # this first/last split favors time-series experiments, where the
    # perturbation effect shows up in the later samples
    control = Utils.log_if_necessary(np.array(control))
    perturbed = Utils.log_if_necessary(np.array(perturbed))
    control = Utils.quantile_normalize(pd.DataFrame(control))
    perturbed = Utils.quantile_normalize(pd.DataFrame(perturbed))
    data.control_array = control.to_numpy().tolist()
    data.perturbed_array = perturbed.to_numpy().tolist()
    return data
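
# A minimal sketch of the two Utils helpers used above, under assumed
# semantics (the project's real implementations may differ):
# log_if_necessary log2-transforms data that still looks like raw
# intensities, and quantile_normalize forces every column onto a shared
# distribution so replicates become comparable.
class Utils:

    @staticmethod
    def log_if_necessary(data):
        # assumed heuristic: raw microarray intensities span a wide dynamic
        # range, so log2-transform when values are clearly not logged yet
        if np.nanmax(data) - np.nanmin(data) > 100:
            return np.log2(data - np.nanmin(data) + 1.0)
        return data

    @staticmethod
    def quantile_normalize(df):
        # classic quantile normalization: rank values within each column,
        # then replace each rank with the mean of all values at that rank
        rank_means = df.stack().groupby(
            df.rank(method="first").stack().astype(int)).mean()
        return df.rank(method="min").stack().astype(int).map(rank_means).unstack()
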
def filter_data(self, logger, data):
    np_data = np.array(data)
    # np.isnan() on an array is not a valid truth value; check .any(), and
    # also catch Inf since nan_to_num repairs both
    if np.isnan(np_data).any() or np.isinf(np_data).any():
        logger.warning("Bad data, we need to fix NaN and Inf")
        np_data = np.nan_to_num(np_data, nan=0.0, posinf=99999.0, neginf=-99999.0)
    np_data = Utils.log_if_necessary(np_data.T)
    if np.isnan(np_data).any():
        logger.error("Bad data, NaNs after log transform")
        return False
    pd_data = pd.DataFrame(np_data)
    pd_data_q = Utils.quantile_normalize(pd_data)
    if np.isnan(pd_data_q.to_numpy()).any():
        logger.error("Bad data, bad normalization")
        return False
    return pd_data_q
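
# Hypothetical usage sketch; 'preprocessor', the logger wiring, and the toy
# matrix below are illustrative assumptions, not part of the original code.
# filter_data returns a quantile-normalized DataFrame on success and False
# when the data cannot be repaired, so callers should test the result with
# 'is False' (a plain truth test on a DataFrame raises ValueError).
import logging

logger = logging.getLogger(__name__)
raw = [[5.1, 7.3, 6.2], [250.0, 180.0, 300.0], [12.4, 9.8, 11.1]]
result = preprocessor.filter_data(logger, raw)  # hypothetical instance
if result is False:
    logger.error("preprocessing failed, skipping this experiment")
else:
    clean = result.to_numpy()  # normalized matrix for the downstream R methods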