Example #1
    def __fix_bad_data(self, data: GeoData):
        max_replicates = 6
        # limit the data to the most important replicates because we don't
        # have enough memory to run the R methods on the full data
        if len(data.control_array) < len(data.genes):
            # some experiments need to be transposed
            control = np.array(data.control_array).T.tolist()
            perturbed = np.array(data.perturbed_array).T.tolist()
        else:
            control = data.control_array
            perturbed = data.perturbed_array
        # for control pick the first replicates
        control = [x[:max_replicates] for x in control]
        # for perturbed pick the last replicates
        perturbed = [x[-max_replicates:] for x in perturbed]
        # the above picks were made to favor time-series experiments
        control = Utils.log_if_necessary(np.array(control))
        perturbed = Utils.log_if_necessary(np.array(perturbed))

        control = Utils.quantile_normalize(pd.DataFrame(control))
        perturbed = Utils.quantile_normalize(pd.DataFrame(perturbed))

        data.control_array = control.to_numpy().tolist()
        data.perturbed_array = perturbed.to_numpy().tolist()
        return data
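
Both examples lean on two project helpers, Utils.log_if_necessary and Utils.quantile_normalize, whose code is not shown in the excerpts. The sketch below is only an assumption of what they might look like: the log heuristic (log2 when the dynamic range suggests raw intensities) and the rank-mean quantile normalization are common choices for expression data, not the project's actual implementation.

import numpy as np
import pandas as pd


class Utils:
    @staticmethod
    def log_if_necessary(values: np.ndarray) -> np.ndarray:
        # assumed heuristic: a large dynamic range means the values are raw
        # intensities, so apply log2 with a +1 offset (expects non-negatives)
        if np.nanmax(values) > 100:
            return np.log2(values + 1.0)
        return values

    @staticmethod
    def quantile_normalize(df: pd.DataFrame) -> pd.DataFrame:
        # classic rank-based quantile normalization: every column is ranked
        # and each rank is replaced by the mean value observed at that rank
        rank_means = df.stack().groupby(
            df.rank(method="first").stack().astype(int)).mean()
        return df.rank(method="min").stack().astype(int).map(rank_means).unstack()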
Example #2
    def filter_data(self, logger, data):
        np_data = np.array(data)
        # warn if the raw matrix contains NaN or infinite values
        if np.isnan(np_data).any() or np.isinf(np_data).any():
            logger.warning("Bad data, we need to fix NaN and Inf")
        # replace NaN/Inf with finite sentinel values so the later steps
        # don't propagate them
        np_data = np.nan_to_num(np_data,
                                nan=0.0,
                                posinf=99999.0,
                                neginf=-99999.0)
        # transpose, then log-transform if the values are not already on a
        # log scale
        np_data = Utils.log_if_necessary(np_data.T)

        # abort if the log transform itself produced NaN values
        if np.isnan(np_data).any():
            logger.error("Bad data, log transform produced NaN")
            return False

        # quantile-normalize so all samples share the same value distribution
        pd_data = pd.DataFrame(np_data)
        pd_data_q = Utils.quantile_normalize(pd_data)
        if np.isnan(pd_data_q.to_numpy()).any():
            logger.error("Bad data, bad normalization")
            return False
        return pd_data_q
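
For context, filter_data returns either the quantile-normalized DataFrame or False when the data cannot be repaired, so callers have to check the return value explicitly. The caller below is purely illustrative: DataFilter is a hypothetical stand-in for whichever class owns filter_data in the real project.

import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("geo-preprocessing")

raw_matrix = [[1.0, 2.5, float("nan")], [3.2, 0.0, 7.1]]
result = DataFilter().filter_data(logger, raw_matrix)  # DataFilter is hypothetical
if result is False:
    logger.error("Filtering failed, skipping this experiment")
else:
    print(result.shape)  # dimensions of the normalized DataFrame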