def insert_geo(self, data: GeoData):
    """Persist a GeoData record as JSON under geo_path/<source>/<name>.json."""
    source_folder = os.path.join(self.geo_path, data.source)
    Utils.create_folder_if_not_exist(self.geo_path)
    Utils.create_folder_if_not_exist(source_folder)
    file_path = os.path.join(source_folder, f"{data.name}.json")
    with open(file_path, "w") as handle:
        json.dump(data.get_as_dict(), handle)
예제 #2
0
 def train(self):
     """Train, evaluate and persist an XGBoost booster.

     Loads the feature file named in the config, shuffles it, splits it
     into train/test parts by config.data_split, trains for
     config.num_round rounds with config.param, logs the test accuracy
     and saves the model to config.model_folder/config.model_name/model.json.
     """
     self.logger.info("started xgb training ")
     data = self.load_training(self.config.feature_file)
     random.shuffle(data)
     cut = int(len(data) * self.config.data_split)
     train_rows, test_rows = data[:cut], data[cut:]
     num_round = self.config.num_round
     param = self.config.param
     self.logger.info(
         f"params for training {param}, num_rounds: {num_round}")
     x_train, y_train = self.get_data(train_rows)
     dtrain = xgb.DMatrix(data=x_train, label=y_train)
     x_test, y_test = self.get_data(test_rows)
     dtest = xgb.DMatrix(data=x_test, label=y_test)
     self.logger.info("Finished preparing data, starting training")
     booster = xgb.train(param, dtrain, num_round)
     self.logger.info("Finished training, starting prediction eval")
     # round raw scores to 0/1 class predictions before scoring
     predictions = [round(score) for score in booster.predict(dtest)]
     accuracy = accuracy_score(y_test, predictions)
     self.logger.info(f'Prediction accuracy: {accuracy}, saving')
     model_dir = os.path.join(self.config.model_folder,
                              self.config.model_name)
     Utils.create_folder_if_not_exist(model_dir)
     booster.save_model(os.path.join(model_dir, "model.json"))
예제 #3
0
    def evaluate(self, group_name):
        """Render one rectified ROC plot per validation source, then reset.

        For every validation source accumulated in self.method_roc, plots
        each method's ROC curve with the uniform-CDF baseline subtracted
        (tpr - fpr diagonal), annotates the legend with the AUC score and
        saves the figure as <output_folder>/<name>/<group_name>_<source>.png.
        Clears self.method_roc when done.
        """
        fig = plt.figure()
        Utils.create_folder_if_not_exist(self.output_folder)
        save_path = os.path.join(self.output_folder, self.name)
        Utils.create_folder_if_not_exist(save_path)

        for validation_source in self.method_roc.keys():
            method_roc = self.method_roc[validation_source]
            methods = []
            for method_name, roc in method_roc.items():
                pred = roc['pred']
                # sanitize predictions: NaN -> 0.0, +inf -> 1.0, -inf -> 0.0
                pred = np.nan_to_num(pred, True, 0.0, 1.0, 0.0)
                fpr, tpr, _ = roc_curve(roc['y'], pred)
                score = roc_auc_score(roc['y'], pred)
                # subtract the no-skill diagonal so a random classifier is flat
                tpr = tpr - uniform.cdf(fpr)
                plt.plot(fpr, tpr)
                methods.append(f"{method_name} AUC {score:.4f}")
            # axis labels apply to the whole figure: set once, not per curve
            plt.xlabel('False Positive Rate', fontsize=16)
            plt.ylabel('True Positive Rate', fontsize=16)
            name = f"{group_name}_{validation_source}"
            path_file = f"{name}.png"
            path_method = os.path.join(save_path, path_file)
            plt.legend(methods)
            # BUG FIX: the title must be set BEFORE savefig, otherwise the
            # saved image has no title (it was previously set after saving)
            plt.title(f"{name}")
            plt.savefig(path_method)
            plt.clf()
            fig.clf()
        self.method_roc = {}
        self.logger.info("done")
예제 #4
0
    def __fix_bad_data(self, data: GeoData):
        """Normalize a GeoData record in place and return it.

        Transposes the expression arrays when they appear to be
        samples-by-genes, keeps at most 6 replicates per gene, then
        log-transforms and quantile-normalizes both arrays.
        """
        max_replicates = 6
        # Replicates are capped because the R methods run out of memory
        # on the full data.
        needs_transpose = len(data.control_array) < len(data.genes)
        if needs_transpose:
            # some experiments arrive transposed (samples x genes)
            control = np.array(data.control_array).T.tolist()
            perturbed = np.array(data.perturbed_array).T.tolist()
        else:
            control = data.control_array
            perturbed = data.perturbed_array
        # first replicates for control, last for perturbed — this choice
        # favors timeseries experiments
        control = [row[:max_replicates] for row in control]
        perturbed = [row[-max_replicates:] for row in perturbed]

        control = Utils.log_if_necessary(np.array(control))
        perturbed = Utils.log_if_necessary(np.array(perturbed))

        control = Utils.quantile_normalize(pd.DataFrame(control))
        perturbed = Utils.quantile_normalize(pd.DataFrame(perturbed))

        data.control_array = control.to_numpy().tolist()
        data.perturbed_array = perturbed.to_numpy().tolist()
        return data
예제 #5
0
 def __init__(self, config_filename):
     """Wire up logging, config, storage and the silico generator manager."""
     self.logger = Utils.get_logger('generate_and_store_silico')
     raw_section = Utils.get_config(config_filename, "SilicoData")
     self.config = GenerateSilicoConfig(**raw_section)
     self.store = Storage(config_filename)
     self.generator_manager = SilicoGeneratorsManager(config_filename)
     self.generator_manager.setup()
예제 #6
0
    def __do_columns_item(self, gse, geo_id, source_name, pf):
        """Build a GeoData record from a GSE that exposes a columns table.

        Tries each configured type label in turn until the series can be
        split into non-empty control/perturbed groups. Returns the
        GeoData on success, or None when no label yields a valid split.
        """
        # iter_labels is consumed one label per pass; note 'all' stores
        # this same (mutated) list in the per-experiment column log
        iter_labels = list(self.type_labels)
        type_labels = self.type_labels
        column_entry = {}
        column_entry['all'] = iter_labels

        while (iter_labels):
            # pick the first remaining candidate label present in gse.columns
            type_label = Utils.find_in_array(gse.columns, iter_labels)
            iter_labels.pop(0)

            if type_label == 'unknown':
                error_msg = f'no label geoid {geo_id} labels:{type_labels}'
                self.logger.error(error_msg)
                continue

            type_idx = gse.columns.columns.get_loc(type_label)
            gene_label = self.__get_genes(gse)

            if gene_label == 'unknown':
                self.logger.error(
                    f'no gene label for geoid {geo_id} labels:{type_labels}')
                continue

            control_series, perturbed_series = self.__split_control_perturbed(
                gse, column_entry, type_idx)

            # both groups must be non-empty; otherwise retry with the
            # next candidate label
            if not control_series:
                continue

            if not perturbed_series:
                continue

            np_control_raw = gse.table[control_series].to_numpy()
            np_perturbed_raw = gse.table[perturbed_series].to_numpy()

            # repair NaNs before storing the raw arrays
            control = Utils.repair_nan_fast(np_control_raw)
            perturbed = Utils.repair_nan_fast(np_perturbed_raw)

            self.logger.info(f'finished {geo_id}')
            geo_data = GeoData({
                "name": geo_id,
                "genes": gene_label,
                "source": source_name,
                "perturbed_series_names": perturbed_series,
                "control_series_names": control_series,
                "extra_info": gse.metadata,
                "perturbed_array": perturbed.tolist(),
                "control_array": control.tolist(),
                "pf": pf
            })

            self.experiment_collumns[geo_id] = column_entry

            return geo_data

        # NOTE(review): message is missing a space before "high cols"
        # (adjacent f-strings concatenate) — cosmetic, left as-is here
        error_msg = (f"could not split {geo_id} in 2 classes"
                     f"high cols: {gse.columns}")
        self.logger.error(error_msg)
        return None
 def insert_validation(self, data: GeneDiffValidation):
     """Write a validation record to <validation_path>/<source>.json."""
     Utils.create_folder_if_not_exist(self.validation_path)
     target = os.path.join(self.validation_path, f"{data.source}.json")
     payload = {
         "source": data.source,
         "data": data.data
     }
     with open(target, "w") as out_file:
         # SetEncoder serializes set values the plain encoder cannot
         json.dump(payload, out_file, cls=Utils.SetEncoder)
예제 #8
0
 def __init__(self, config_filename):
     """Load the benchmark config and set up method, storage and metric managers."""
     self.logger = Utils.get_logger("Benchmark")
     raw_config = Utils.get_config(config_filename, "Benchmark")
     self.config = BenchamarkDiffMethodsConfig(logger=self.logger,
                                               **raw_config)
     self.method_manager = DiffMethodsManager(config_filename)
     self.method_manager.setup()
     self.storage = Storage(config_filename)
     self.metric_manager = MetricManager(config_filename)
     self.metric_manager.setup()
예제 #9
0
 def __init__(self, config_filename):
     """Instantiate every storage provider declared in the config file."""
     self.logger = Utils.get_logger('Storage')
     config_section = Utils.get_config(config_filename, "Storage")
     self.config = StorageConfig(config_section)
     self.providers = {
         name: Storage.create_provider(name, provider_config)
         for name, provider_config in self.config.providers.items()
     }
     self.logger.info(f"Started storage with config {config_section}")
    def insert_method_results(self, result: GeneMethodResult, method_name: str,
                              experiment_name: str):
        """Store a method's result as results_path/<method>/<experiment>.json."""
        Utils.create_folder_if_not_exist(self.results_path)
        method_folder = os.path.join(self.results_path, method_name)
        Utils.create_folder_if_not_exist(method_folder)
        target = os.path.join(method_folder, f"{experiment_name}.json")

        with open(target, "w") as out_file:
            json.dump(result.to_dict(), out_file)
예제 #11
0
 def __init__(self, config_filename):
     """Read the GEOImporter config section and cache labeling shortcuts."""
     GEOparse.set_verbosity("ERROR")
     self.config_filename = config_filename
     section = Utils.get_config(config_filename, 'GEOImporter')
     self.config = GEOImporterConfig(**section)
     self.logger = Utils.get_logger('GEOImporter')
     self.storage = Storage(config_filename)
     # convenience aliases into the labeling config
     self.labels = self.config.labeling
     self.inputs = self.config.input_data
     self.control_labels = self.labels.control
     self.type_labels = self.labels.type
     self.gene_names = self.labels.gene_names
     self.path = self.config.data_path
     # per-experiment column log, filled while importing
     self.experiment_collumns = {}
예제 #12
0
    def __do_no_colums_item(self, gse, geo_id, source_name, pf):
        """Build a GeoData record from a GSE without a columns table.

        Classifies each GSM as control or perturbed by matching its
        phenotype title against the configured no-column control labels,
        collects the VALUE column of every GSM, and returns a GeoData
        record (or None when no control sample is found).
        """
        control = self.labels.no_column_control
        phenotype_data = gse.phenotype_data
        columns = phenotype_data.columns
        info_experiment_idx = columns.get_loc(self.labels.no_column_title)
        gsm_ids_idx = columns.get_loc(self.labels.no_column_accession)
        gsm_type = list(phenotype_data.values[:, info_experiment_idx])
        gsm_ids = list(phenotype_data.values[:, gsm_ids_idx])
        control_gsms = []
        perturbation_gsms = []
        raw_control_data = []
        raw_perturbed_data = []
        for idx in range(0, len(gsm_type)):
            gsm_id = gsm_ids[idx]
            table = gse.gsms[gsm_id].table
            value_idx = table.columns.get_loc('VALUE')
            values = gse.gsms[gsm_id].table.values[:, value_idx].tolist()
            # a title matching any control label marks the GSM as control
            if Utils.find_in_array(gsm_type[idx], control) != 'unknown':
                control_gsms.append(gsm_id)
                raw_control_data.append(values)
            else:
                perturbation_gsms.append(gsm_id)
                raw_perturbed_data.append(values)

        if not control_gsms:
            # BUG FIX: was `self.logger('...')` (calling the logger object)
            # and the message was missing its f-prefix, so geo_id was
            # never interpolated
            self.logger.error(f'[no col]no control for {geo_id}')
            return None

        # gene identifiers come from the first column of any GSM table
        genes = gse.gsms[control_gsms[0]].table.values[:, 0]
        np_control_raw = np.array(raw_control_data)
        np_perturbed_raw = np.array(raw_perturbed_data)

        control = Utils.repair_nan_fast(np_control_raw)
        perturbed = Utils.repair_nan_fast(np_perturbed_raw)

        self.logger.info(f'finished {geo_id}')
        geo_data = GeoData({
            "name": geo_id,
            "genes": genes.tolist(),
            "source": source_name,
            "perturbed_series_names": perturbation_gsms,
            "control_series_names": control_gsms,
            "extra_info": gse.metadata,
            "perturbed_array": perturbed.tolist(),
            "control_array": control.tolist(),
            "pf": pf
        })
        return geo_data
예제 #13
0
 def __init__(self, config, output_folder):
     """ROC metric: accumulates per-method predictions until evaluate()."""
     self.name = "ROC"
     self.logger = Utils.get_logger("metric_roc")
     self.config = config
     self.output_folder = output_folder
     # per-validation-source accumulator, cleared by evaluate()
     self.method_roc = {}
예제 #14
0
    def __from_attribute_matrix(self, name, dict_file_path,
                                data_file_path, base_path):
        """Build a {tf_name: set(gene_names)} mapping from an attribute matrix.

        Reads the attribute dictionary and the TSV data matrix; every
        non-zero (gene, attribute) cell records the (lower-cased) gene
        under the attribute's transcription-factor name. Logs the number
        of new TFs and links found, and returns the mapping.
        """
        dict_path = os.path.join(base_path, dict_file_path)
        data_path = os.path.join(base_path, data_file_path)
        collect = {}
        new_links = 0
        new_tfs = 0
        self.logger.info(f"Doing db {name}")
        # FIX: removed a dead `attributes = {}` store that was
        # immediately overwritten by the line below
        attributes = self.__load_dict_attributes(dict_path)

        with open(data_path) as tsvfile:
            reader = self.__get_tsv_reader(tsvfile)
            for row in reader:
                popGene = row['GeneSym'].lower()
                for key, value in row.items():
                    if key not in attributes:
                        continue
                    # a zero-valued cell means "no link"; skip it
                    value_float = float(value)
                    if Utils.isclose(value_float, 0.0):
                        continue
                    tf_name = attributes[key].lower()
                    if tf_name in collect:
                        if popGene not in collect[tf_name]:
                            collect[tf_name].add(popGene)
                            new_links += 1
                    else:
                        collect[tf_name] = {popGene}
                        new_tfs += 1
        message = (f"TF db:{name }",
                   f"new tfs: {new_tfs}",
                   f"new links: {new_links}")
        self.logger.info(message)
        return collect
예제 #15
0
 def __init__(self, config, output_folder):
     """Kolmogorov metric: accumulates per-method ranks until evaluate()."""
     self.name = "Kolmogorov"
     self.logger = Utils.get_logger("metric_kolmogorov")
     # per-validation-source rank accumulator, cleared by evaluate()
     self.method_rks = {}
     self.config = config
     self.output_folder = output_folder
    def generate_validation_data(self, num_genes,
                                 num_pfs) -> GeneDiffValidation:
        """Build random in-silico validation data mapping TFs to gene lists."""
        gene_names = Utils.get_random_gene_names(num_genes)
        pf_names = Utils.get_random_tf_names(num_pfs)
        data = {}
        for pf in pf_names:
            # sample with replacement then dedupe, so each TF ends up
            # with a variable-sized set of unique target genes
            sample_size = random.randint(2, len(gene_names))
            sampled = random.choices(gene_names, k=sample_size)
            data[pf] = list(set(sampled))

        validation_data = GeneDiffValidation()
        validation_data.source = "silico"
        validation_data.data = data
        return validation_data
예제 #17
0
 def plot_cdf(self, data):
     """Plot the ECDF of *data* with the uniform-CDF baseline subtracted."""
     x, y = Utils.ecdf(data)
     # close the curve at (1, 1) before subtracting the baseline
     x = np.append(x, [1.0])
     y = np.append(y, [1.0])
     plt.plot(x, y - uniform.cdf(x))
     plt.xlabel('rank', fontsize=16)
     plt.ylabel('cdf(r)-r', fontsize=16)
예제 #18
0
def setup_default_data(config_filename):
    """Download the default GeneBench data archive and unpack it into data_path."""
    section = Utils.get_config(config_filename, 'GEOImporter')
    config = GEOImporterConfig(**section)
    archive_url = 'https://github.com/raduangelescu/GeneBench/raw/main/genebench-data.7z'
    archive_path = os.path.join(config.data_path, "genebench-data.7z")
    wget.download(archive_url, archive_path)
    with py7zr.SevenZipFile(archive_path, mode='r') as archive:
        archive.extractall(config.data_path)
    # the archive is no longer needed once extracted
    os.remove(archive_path)
예제 #19
0
    def setup(self, config):
        """Parse the model config and load the saved Keras model from disk."""
        self.config = MIDGETNeuralConfig(**config)
        self.logger = Utils.get_logger(
            f"MIDGETNeural[{self.config.model_name}]")

        model_path = os.path.join(self.config.output_folder,
                                  self.config.model_name, "model")
        self.logger.info(f"Loading model: {model_path}")
        self.model = tf.keras.models.load_model(model_path)
 def __init__(self, config: dict):
     """Create the base/geo/validation/results folder layout from config."""
     self.config = FileSystemConfig(config)
     base_path = self.config.base_path
     self.geo_path = os.path.join(base_path, self.config.geo_folder)
     self.validation_path = os.path.join(base_path,
                                         self.config.validation_folder)
     self.results_path = os.path.join(base_path, self.config.results_folder)
     # create parents before children: base first, then each sub-folder
     for folder in (base_path, self.geo_path, self.validation_path,
                    self.results_path):
         Utils.create_folder_if_not_exist(folder)
예제 #21
0
    def evaluate(self, group_name):
        """Write per-source F1 scores to text files, then reset the state."""
        Utils.create_folder_if_not_exist(self.output_folder)
        save_path = os.path.join(self.output_folder, self.name)
        Utils.create_folder_if_not_exist(save_path)

        for validation_source in self.method_f1.keys():
            lines = []
            for method_name, entry in self.method_f1[validation_source].items():
                # sanitize predictions: NaN -> 0.0, +inf -> 1.0, -inf -> 0.0
                pred = np.nan_to_num(entry['pred'], True, 0.0, 1.0, 0.0)
                f1 = f1_score(entry['y'], pred)
                lines.append(f"{method_name} F1 Score: {f1:.4f}")
            name = f"{group_name}_{validation_source}"
            out_path = os.path.join(save_path, f"{name}.txt")
            with open(out_path, mode='wt', encoding='utf-8') as out_scores:
                out_scores.write('\n'.join(lines))
        self.method_f1 = {}
        self.logger.info("done")
예제 #22
0
    def filter_data(self, logger, data):
        """Clean a raw expression matrix and quantile-normalize it.

        Replaces NaN/inf values, log-transforms the transposed matrix if
        needed, and quantile-normalizes. Returns the normalized DataFrame,
        or False when the data is still invalid after cleaning.
        """
        np_data = np.array(data)
        # BUG FIX: `if np.isnan(np_data):` raises ValueError ("truth value
        # of an array ... is ambiguous") for any array with more than one
        # element — reduce with .any() like the checks below
        if np.isnan(np_data).any():
            logger.warning("Bad data, we need to fix NAN and Inf")
        np_data = np.nan_to_num(np_data,
                                nan=0.0,
                                posinf=99999.0,
                                neginf=-99999.0)
        np_data = Utils.log_if_necessary(np_data.T)

        if np.isnan(np_data).any():
            logger.error("Bad data, not log")
            return False

        pd_data = pd.DataFrame(np_data)
        pd_data_q = Utils.quantile_normalize(pd_data)
        if np.isnan(pd_data_q.to_numpy()).any():
            logger.error("Bad data, bad normalization")
            return False
        return pd_data_q
예제 #23
0
    def __split_control_perturbed(self, gse, column_entry, type_idx):
        """Partition GSE series into [control, perturbed] lists by type label.

        Also records the last-seen control/perturbed label text in
        column_entry for the experiment's column log.
        """
        control_series = []
        perturbed_series = []
        for series_name, description in gse.columns.iterrows():
            label = description[type_idx]
            if Utils.is_control(label, self.control_labels):
                column_entry['control'] = label
                control_series.append(series_name)
            else:
                column_entry['perturbed'] = label
                perturbed_series.append(series_name)

        return [control_series, perturbed_series]
예제 #24
0
    def __get_all_raw_geo(self):
        """Load, filter and lower-case every stored GEO experiment.

        Returns a list of dicts with 'control', 'perturbed', 'genes' and
        'geo_id' keys; experiments whose arrays fail filtering are
        skipped with an error log.
        """
        all_data = []
        geo_datas = self.storage.get_geo_data()
        for data in geo_datas:
            geo_id = data["name"]
            valid_c, control = Utils.filter_data(self.logger, data["control"])
            valid_p, perturbed = Utils.filter_data(self.logger,
                                                   data["perturbed"])

            if valid_c is False or valid_p is False:
                self.logger.error(f"Bad data, skiping geo_id {geo_id}")
                continue

            gene_names = data["genes"]
            gene_names = [name.lower() for name in gene_names]
            all_data.append({
                'control': control,
                'perturbed': perturbed,
                'genes': gene_names,
                'geo_id': geo_id
            })
        # BUG FIX: the accumulated list was built but never returned,
        # so the method always yielded None
        return all_data
예제 #25
0
    def evaluate(self, group_name):
        """Plot per-source rectified CDF curves of method ranks, then reset.

        For every validation source accumulated in self.method_rks,
        sorts each method's ranks, plots their ECDF minus the uniform
        baseline via self.plot_cdf, and saves the figure to
        <output_folder>/<name>/<group_name>_<source>.png. Clears
        self.method_rks when done.
        """
        fig = plt.figure()
        Utils.create_folder_if_not_exist(self.output_folder)
        save_path = os.path.join(self.output_folder, self.name)
        Utils.create_folder_if_not_exist(save_path)

        for validation_source in self.method_rks.keys():
            method_rks = self.method_rks[validation_source]
            methods = []
            for method_name, rks in method_rks.items():
                rks_array = np.sort(np.array(rks))
                self.plot_cdf(rks_array)
                methods.append(method_name)
            name = f"{group_name}_{validation_source}"
            path_file = f"{name}.png"
            path_method = os.path.join(save_path, path_file)
            plt.legend(methods)
            # BUG FIX: the title must be set BEFORE savefig, otherwise the
            # saved image has no title (it was previously set after saving)
            plt.title(f"{name}")
            plt.savefig(path_method)
            plt.clf()
            fig.clf()
        self.method_rks = {}
        self.logger.info("done")
예제 #26
0
 def setup(self, config):
     """Parse config and load the xgboost model from disk when it exists."""
     self.config = MIDGETXgBoostConfig(**config)
     self.logger = Utils.get_logger(
         f"MIDGETXgBoost[{self.config.model_name}]")
     model_path = os.path.join(self.config.model_folder,
                               self.config.model_name, 'model.json')
     self.logger.info(f"pir: Loading model: {model_path}")
     if not os.path.isfile(model_path):
         # no saved model yet: leave self.model unset
         self.logger.warning(f"No model located in {model_path}")
         return
     booster = xgb.Booster(self.config.param)
     booster.load_model(model_path)
     self.model = booster
    def get_geo(self, filter) -> List[GeoData]:
        """Fetch GeoData records matching *filter*.

        With 'name' (and 'source') in the filter, returns that single
        experiment; with only 'source', every experiment of that source;
        with neither, every experiment under geo_path.
        """
        if 'name' in filter:
            # single experiment — requires 'source' in the filter as well
            file_path = os.path.join(self.geo_path, filter['source'],
                                     f"{filter['name']}.json")
            return [self.__load_data_from_file(file_path, GeoData)]

        if 'source' in filter:
            # all experiments of one source
            source_folder = os.path.join(self.geo_path, filter['source'])
            return self.__load_data_from_folder(source_folder, GeoData)

        # no filter: walk every source folder under geo_path
        ret_data = []
        for fld in Utils.list_folders_in_folder(self.geo_path):
            fld_path = os.path.join(self.geo_path, fld)
            ret_data.extend(self.__load_data_from_folder(fld_path, GeoData))
        return ret_data
예제 #28
0
    def generate_single(self, validation_data, id, num_genes) -> GeoData:
        """Generate one synthetic GeoData experiment for a random TF.

        Picks a TF from the validation data, draws random gene names and
        random control replicates, then adds a diff_factor-scaled random
        effect to the genes that TF perturbs.
        """
        all_tfs = list(validation_data.data.keys())
        picked_tf = random.choice(all_tfs)
        perturbed_genes = set(validation_data.data[picked_tf])
        genes = Utils.get_random_gene_names(num_genes)
        # 1 where the gene is a target of the picked TF, else 0
        mask = np.array([1 if gene in perturbed_genes else 0
                         for gene in genes])
        df_factor = self.param.diff_factor
        # FIX: removed a dead `validation` list that collected the masked
        # gene names but was never used or returned
        num_replicates = self.param.num_replicates
        # replicate the mask per column: final shape (num_genes, num_replicates)
        mask = np.array([mask])
        mask = np.repeat(mask, num_replicates, axis=0).T
        control = np.random.rand(num_genes, num_replicates)
        effect = np.random.rand(num_genes, num_replicates) * df_factor
        perturbation = control + np.multiply(mask, effect)

        gene_data = GeoData({
            "name": f"SIL_{id}",
            "perturbed_series_names": ['fakeseries'],
            "control_series_names": ['fakeseries'],
            "extra_info": {
                "none": "none"
            },
            "perturbed_array": perturbation.tolist(),
            "control_array": control.tolist(),
            "source": self.config.source,
            "genes": genes,
            "pf": picked_tf
        })

        return gene_data
예제 #29
0
 def __do_input(self, input):
     """Download and import every GEO experiment listed in one input config.

     Each data item is fetched (with retry) into a per-input cache
     folder, converted to GeoData, repaired and stored. Exits the
     process if any download ultimately fails.
     """
     cache_folder = self.config.cache_folder
     cache_path = os.path.join(self.path, cache_folder, input.name)
     created_folder = Utils.create_folder_if_not_exist(cache_path)
     if created_folder:
         self.logger.info(f"created directory {cache_path}")
     # NOTE(review): log_data is never populated, so the JSON dumped at
     # the end is always '{}' — confirm whether column logging was
     # intended to be collected here
     log_data = {}
     for data_item in input.data:
         geo_id = data_item['geoid']
         pf = data_item[input.pf_field].lower()
         info_msg = f'Getting GEO: {geo_id} in cache folder {cache_path}'
         self.logger.info(info_msg)
         gse = self.__download_retry(geo_id, cache_path)
         if gse is None:
             # hard stop: downstream steps assume every listed GEO exists
             sys.exit(f"Failed to download data for {geo_id}")
         geo_data = self.__do_data_item(gse, geo_id, input.name, pf)
         if geo_data:
             # normalize/repair arrays before persisting
             geo_data = self.__fix_bad_data(geo_data)
             self.storage.insert_geo(geo_data)
     self.logger.info('Writing collumns to json file')
     log_str = json.dumps(log_data, sort_keys=True, indent=4)
     self.logger.info(f"{log_str}")
     self.logger.info(f'Finished importing GEO data for {input.name}')
예제 #30
0
 def __get_genes(self, gse):
     """Return the deduplicated gene list from the GSE table's gene column."""
     label = Utils.find_in_array(self.gene_names, gse.table.columns)
     genes = gse.table[label].tolist()
     return Utils.deduplicate_genes(genes)