def insert_geo(self, data: GeoData):
    Utils.create_folder_if_not_exist(self.geo_path)
    folder = os.path.join(self.geo_path, data.source)
    Utils.create_folder_if_not_exist(folder)
    out_path = os.path.join(folder, f"{data.name}.json")
    with open(out_path, "w") as out:
        json.dump(data.get_as_dict(), out)
def train(self):
    features_file_name = self.config.feature_file
    self.logger.info("started xgb training")
    data = self.load_training(features_file_name)
    random.shuffle(data)
    split_point = int(len(data) * self.config.data_split)
    train_data_split = data[:split_point]
    test_data_split = data[split_point:]
    num_round = self.config.num_round
    param = self.config.param
    self.logger.info(
        f"params for training {param}, num_rounds: {num_round}")
    X_train_split, y_train_split = self.get_data(train_data_split)
    train_data = xgb.DMatrix(data=X_train_split, label=y_train_split)
    X_test_split, y_test_split = self.get_data(test_data_split)
    test_data = xgb.DMatrix(data=X_test_split, label=y_test_split)
    self.logger.info("Finished preparing data, starting training")
    bst = xgb.train(param, train_data, num_round)
    self.logger.info("Finished training, starting prediction eval")
    y_pred = bst.predict(test_data)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test_split, predictions)
    self.logger.info(f"Prediction accuracy: {accuracy}, saving")
    model_path = os.path.join(self.config.model_folder,
                              self.config.model_name)
    Utils.create_folder_if_not_exist(model_path)
    bst.save_model(os.path.join(model_path, "model.json"))
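# Usage sketch for the booster saved by train() above: reloading it for
# inference, mirroring what the MIDGETXgBoost setup() further down does.
# The model_folder/model_name values come from the same config; X_new is a
# placeholder feature matrix.
def load_trained_booster(model_folder, model_name):
    bst = xgb.Booster()
    bst.load_model(os.path.join(model_folder, model_name, "model.json"))
    return bst
# bst = load_trained_booster(cfg.model_folder, cfg.model_name)
# preds = bst.predict(xgb.DMatrix(X_new))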
def evaluate(self, group_name):
    fig = plt.figure()
    Utils.create_folder_if_not_exist(self.output_folder)
    save_path = os.path.join(self.output_folder, self.name)
    Utils.create_folder_if_not_exist(save_path)
    for validation_source in self.method_roc.keys():
        method_roc = self.method_roc[validation_source]
        methods = []
        for method_name, roc in method_roc.items():
            pred = roc['pred']
            # replace NaN with 0.0, +inf with 1.0 and -inf with 0.0
            pred = np.nan_to_num(pred, copy=True, nan=0.0,
                                 posinf=1.0, neginf=0.0)
            fpr, tpr, _ = roc_curve(roc['y'], pred)
            score = roc_auc_score(roc['y'], pred)
            # plot TPR relative to the diagonal (random-classifier baseline)
            tpr = tpr - uniform.cdf(fpr)
            plt.plot(fpr, tpr)
            plt.xlabel('False Positive Rate', fontsize=16)
            plt.ylabel('True Positive Rate', fontsize=16)
            methods.append(f"{method_name} AUC {score:.4f}")
        name = f"{group_name}_{validation_source}"
        path_file = f"{name}.png"
        path_method = os.path.join(save_path, path_file)
        plt.legend(methods)
        plt.title(name)  # set the title before saving, or it is lost
        plt.savefig(path_method)
        plt.clf()
    fig.clf()
    self.method_roc = {}
    self.logger.info("done")
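# A minimal, self-contained sketch (synthetic labels and scores, not project
# data) of the transform used in evaluate() above: subtracting the diagonal
# uniform.cdf(fpr) from the TPR, so a random classifier plots as a flat line
# at zero. It relies on the same numpy/scipy/sklearn imports the module
# already uses.
def demo_diagonal_adjusted_roc():
    rng = np.random.default_rng(0)
    y = rng.integers(0, 2, size=200)          # synthetic binary labels
    scores = y * 0.3 + rng.random(200) * 0.7  # weakly informative scores
    fpr, tpr, _ = roc_curve(y, scores)
    adjusted = tpr - uniform.cdf(fpr)         # ~0 everywhere == random guessing
    return fpr, adjusted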
def __fix_bad_data(self, data: GeoData):
    # limit data to the most important replicates because we don't have
    # enough memory to run the R methods with the full data
    max_replicates = 6
    if len(data.control_array) < len(data.genes):
        # some experiments need to be transposed so rows are genes
        control = np.array(data.control_array).T.tolist()
        perturbed = np.array(data.perturbed_array).T.tolist()
    else:
        control = data.control_array
        perturbed = data.perturbed_array
    # for control pick the first replicates
    control = [x[:max_replicates] for x in control]
    # for perturbed pick the last replicates
    perturbed = [x[-max_replicates:] for x in perturbed]
    # the asymmetric pick above favors time-series experiments
    control = Utils.log_if_necessary(np.array(control))
    perturbed = Utils.log_if_necessary(np.array(perturbed))
    control = Utils.quantile_normalize(pd.DataFrame(control))
    perturbed = Utils.quantile_normalize(pd.DataFrame(perturbed))
    data.control_array = control.to_numpy().tolist()
    data.perturbed_array = perturbed.to_numpy().tolist()
    return data
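# Sketch of what a quantile-normalize helper such as Utils.quantile_normalize
# typically computes. Assumption: the project helper follows the standard
# rank/mean recipe; this version is illustrative, not the library's code.
def quantile_normalize_sketch(df: pd.DataFrame) -> pd.DataFrame:
    # the mean of each rank across columns becomes the shared distribution
    rank_mean = df.stack().groupby(
        df.rank(method='first').stack().astype(int)).mean()
    # map every value to the mean of its rank
    return df.rank(method='min').stack().astype(int).map(rank_mean).unstack()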
def __init__(self, config_filename):
    self.logger = Utils.get_logger('generate_and_store_silico')
    config_raw = Utils.get_config(config_filename, "SilicoData")
    self.config = GenerateSilicoConfig(**config_raw)
    self.store = Storage(config_filename)
    self.generator_manager = SilicoGeneratorsManager(config_filename)
    self.generator_manager.setup()
def __do_columns_item(self, gse, geo_id, source_name, pf):
    iter_labels = list(self.type_labels)
    type_labels = self.type_labels
    column_entry = {}
    # keep a snapshot, since iter_labels is consumed by the loop below
    column_entry['all'] = list(type_labels)
    while iter_labels:
        type_label = Utils.find_in_array(gse.columns, iter_labels)
        iter_labels.pop(0)
        if type_label == 'unknown':
            error_msg = f'no label geoid {geo_id} labels: {type_labels}'
            self.logger.error(error_msg)
            continue
        type_idx = gse.columns.columns.get_loc(type_label)
        gene_label = self.__get_genes(gse)
        if gene_label == 'unknown':
            self.logger.error(
                f'no gene label for geoid {geo_id} labels: {type_labels}')
            continue
        control_series, perturbed_series = self.__split_control_perturbed(
            gse, column_entry, type_idx)
        if not control_series:
            continue
        if not perturbed_series:
            continue
        np_control_raw = gse.table[control_series].to_numpy()
        np_perturbed_raw = gse.table[perturbed_series].to_numpy()
        control = Utils.repair_nan_fast(np_control_raw)
        perturbed = Utils.repair_nan_fast(np_perturbed_raw)
        self.logger.info(f'finished {geo_id}')
        geo_data = GeoData({
            "name": geo_id,
            "genes": gene_label,
            "source": source_name,
            "perturbed_series_names": perturbed_series,
            "control_series_names": control_series,
            "extra_info": gse.metadata,
            "perturbed_array": perturbed.tolist(),
            "control_array": control.tolist(),
            "pf": pf
        })
        self.experiment_collumns[geo_id] = column_entry
        return geo_data
    error_msg = (f"could not split {geo_id} into 2 classes, "
                 f"high cols: {gse.columns}")
    self.logger.error(error_msg)
    return None
def insert_validation(self, data: GeneDiffValidation):
    Utils.create_folder_if_not_exist(self.validation_path)
    out_path = os.path.join(self.validation_path, f"{data.source}.json")
    with open(out_path, "w") as out:
        json.dump({
            "source": data.source,
            "data": data.data
        }, out, cls=Utils.SetEncoder)
def __init__(self, config_filename): self.logger = Utils.get_logger("Benchmark") self.config = Utils.get_config(config_filename, "Benchmark") self.config = BenchamarkDiffMethodsConfig(logger=self.logger, **self.config) self.method_manager = DiffMethodsManager(config_filename) self.method_manager.setup() self.storage = Storage(config_filename) self.metric_manager = MetricManager(config_filename) self.metric_manager.setup()
def __init__(self, config_filename):
    self.logger = Utils.get_logger('Storage')
    config_section = Utils.get_config(config_filename, "Storage")
    self.config = StorageConfig(config_section)
    self.providers = {}
    providers = self.config.providers
    for provider_name, provider_json_config in providers.items():
        self.providers[provider_name] = Storage.create_provider(
            provider_name, provider_json_config)
    self.logger.info(f"Started storage with config {config_section}")
def insert_method_results(self, result: GeneMethodResult,
                          method_name: str, experiment_name: str):
    Utils.create_folder_if_not_exist(self.results_path)
    method_folder = os.path.join(self.results_path, method_name)
    Utils.create_folder_if_not_exist(method_folder)
    file_output = os.path.join(method_folder, f"{experiment_name}.json")
    with open(file_output, "w") as out:
        output = result.to_dict()
        json.dump(output, out)
def __init__(self, config_filename):
    GEOparse.set_verbosity("ERROR")
    self.config_filename = config_filename
    data_section = Utils.get_config(config_filename, 'GEOImporter')
    self.config = GEOImporterConfig(**data_section)
    self.logger = Utils.get_logger('GEOImporter')
    self.storage = Storage(config_filename)
    self.labels = self.config.labeling
    self.inputs = self.config.input_data
    self.control_labels = self.labels.control
    self.type_labels = self.labels.type
    self.gene_names = self.labels.gene_names
    self.path = self.config.data_path
    self.experiment_collumns = {}
def __do_no_colums_item(self, gse, geo_id, source_name, pf):
    control = self.labels.no_column_control
    phenotype_data = gse.phenotype_data
    columns = phenotype_data.columns
    info_experiment_idx = columns.get_loc(self.labels.no_column_title)
    gsm_ids_idx = columns.get_loc(self.labels.no_column_accession)
    gsm_type = list(phenotype_data.values[:, info_experiment_idx])
    gsm_ids = list(phenotype_data.values[:, gsm_ids_idx])
    control_gsms = []
    perturbation_gsms = []
    raw_control_data = []
    raw_perturbed_data = []
    for idx in range(len(gsm_type)):
        gsm_id = gsm_ids[idx]
        table = gse.gsms[gsm_id].table
        value_idx = table.columns.get_loc('VALUE')
        values = table.values[:, value_idx].tolist()
        if Utils.find_in_array(gsm_type[idx], control) != 'unknown':
            control_gsms.append(gsm_id)
            raw_control_data.append(values)
        else:
            perturbation_gsms.append(gsm_id)
            raw_perturbed_data.append(values)
    if not control_gsms:
        self.logger.error(f'[no col] no control for {geo_id}')
        return None
    genes = gse.gsms[control_gsms[0]].table.values[:, 0]
    np_control_raw = np.array(raw_control_data)
    np_perturbed_raw = np.array(raw_perturbed_data)
    control = Utils.repair_nan_fast(np_control_raw)
    perturbed = Utils.repair_nan_fast(np_perturbed_raw)
    self.logger.info(f'finished {geo_id}')
    geo_data = GeoData({
        "name": geo_id,
        "genes": genes.tolist(),
        "source": source_name,
        "perturbed_series_names": perturbation_gsms,
        "control_series_names": control_gsms,
        "extra_info": gse.metadata,
        "perturbed_array": perturbed.tolist(),
        "control_array": control.tolist(),
        "pf": pf
    })
    return geo_data
def __init__(self, config, output_folder): self.name = "ROC" self.logger = Utils.get_logger("metric_roc") self.config = config self.output_folder = output_folder self.method_roc = {} pass
def __from_attribute_matrix(self, name, dict_file_path,
                            data_file_path, base_path):
    dict_path = os.path.join(base_path, dict_file_path)
    data_path = os.path.join(base_path, data_file_path)
    collect = {}
    new_links = 0
    new_tfs = 0
    self.logger.info(f"Doing db {name}")
    attributes = self.__load_dict_attributes(dict_path)
    with open(data_path) as tsvfile:
        reader = self.__get_tsv_reader(tsvfile)
        for row in reader:
            popGene = row['GeneSym'].lower()
            for key, value in row.items():
                if key not in attributes:
                    continue
                value_float = float(value)
                if Utils.isclose(value_float, 0.0):
                    continue
                tf_name = attributes[key].lower()
                if tf_name in collect:
                    if popGene not in collect[tf_name]:
                        collect[tf_name].add(popGene)
                        new_links += 1
                else:
                    collect[tf_name] = {popGene}
                    new_tfs += 1
    message = (f"TF db: {name}, new tfs: {new_tfs}, "
               f"new links: {new_links}")
    self.logger.info(message)
    return collect
def __init__(self, config, output_folder): self.name = "Kolmogorov" self.logger = Utils.get_logger("metric_kolmogorov") self.method_rks = {} self.config = config self.output_folder = output_folder pass
def generate_validation_data(self, num_genes,
                             num_pfs) -> GeneDiffValidation:
    gene_names = Utils.get_random_gene_names(num_genes)
    pf_names = Utils.get_random_tf_names(num_pfs)
    data = {}
    for pf in pf_names:
        num = random.randint(2, len(gene_names))
        # random.choices samples with replacement, so deduplicate
        choice_list = random.choices(gene_names, k=num)
        choice_set = set(choice_list)
        data[pf] = list(choice_set)
    validation_data = GeneDiffValidation()
    validation_data.source = "silico"
    validation_data.data = data
    return validation_data
def plot_cdf(self, data):
    x, y = Utils.ecdf(data)
    x = np.append(x, [1.0])
    y = np.append(y, [1.0])
    # plot the ECDF relative to the uniform CDF (the diagonal)
    y = y - uniform.cdf(x)
    plt.plot(x, y)
    plt.xlabel('rank', fontsize=16)
    plt.ylabel('cdf(r)-r', fontsize=16)
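# Sketch of an ECDF helper with the shape plot_cdf expects. Assumption: the
# project's Utils.ecdf returns sorted values and cumulative fractions; this
# illustrative version does exactly that.
def ecdf_sketch(data):
    x = np.sort(np.asarray(data))
    y = np.arange(1, len(x) + 1) / len(x)  # cumulative fraction at each point
    return x, y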
def setup_default_data(config_filename):
    data_section = Utils.get_config(config_filename, 'GEOImporter')
    config = GEOImporterConfig(**data_section)
    default_data_url = 'https://github.com/raduangelescu/GeneBench/raw/main/genebench-data.7z'
    download_file = os.path.join(config.data_path, "genebench-data.7z")
    wget.download(default_data_url, download_file)
    with py7zr.SevenZipFile(download_file, mode='r') as z:
        z.extractall(config.data_path)
    os.remove(download_file)
def setup(self, config):
    self.config = MIDGETNeuralConfig(**config)
    logger_name = f"MIDGETNeural[{self.config.model_name}]"
    self.logger = Utils.get_logger(logger_name)
    model_path = os.path.join(self.config.output_folder,
                              self.config.model_name,
                              "model")
    self.logger.info(f"Loading model: {model_path}")
    self.model = tf.keras.models.load_model(model_path)
def __init__(self, config: dict):
    self.config = FileSystemConfig(config)
    base_path = self.config.base_path
    Utils.create_folder_if_not_exist(base_path)
    self.geo_path = os.path.join(base_path, self.config.geo_folder)
    Utils.create_folder_if_not_exist(self.geo_path)
    self.validation_path = os.path.join(base_path,
                                        self.config.validation_folder)
    Utils.create_folder_if_not_exist(self.validation_path)
    self.results_path = os.path.join(base_path,
                                     self.config.results_folder)
    Utils.create_folder_if_not_exist(self.results_path)
def evaluate(self, group_name):
    Utils.create_folder_if_not_exist(self.output_folder)
    save_path = os.path.join(self.output_folder, self.name)
    Utils.create_folder_if_not_exist(save_path)
    for validation_source in self.method_f1.keys():
        method_f1 = self.method_f1[validation_source]
        methods = []
        for method_name, _f1 in method_f1.items():
            pred = _f1['pred']
            # replace NaN with 0.0, +inf with 1.0 and -inf with 0.0
            pred = np.nan_to_num(pred, copy=True, nan=0.0,
                                 posinf=1.0, neginf=0.0)
            f1 = f1_score(_f1['y'], pred)
            methods.append(f"{method_name} F1 Score: {f1:.4f}")
        name = f"{group_name}_{validation_source}"
        path_file = f"{name}.txt"
        path_method = os.path.join(save_path, path_file)
        with open(path_method, mode='wt', encoding='utf-8') as out_scores:
            out_scores.write('\n'.join(methods))
    self.method_f1 = {}
    self.logger.info("done")
def filter_data(self, logger, data):
    np_data = np.array(data)
    if np.isnan(np_data).any():
        logger.warning("Bad data, we need to fix NaN and Inf")
        np_data = np.nan_to_num(np_data, nan=0.0,
                                posinf=99999.0, neginf=-99999.0)
    np_data = Utils.log_if_necessary(np_data.T)
    if np.isnan(np_data).any():
        logger.error("Bad data, cannot take log")
        return False, None
    pd_data = pd.DataFrame(np_data)
    pd_data_q = Utils.quantile_normalize(pd_data)
    if np.isnan(pd_data_q.to_numpy()).any():
        logger.error("Bad data, bad normalization")
        return False, None
    return True, pd_data_q
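# Usage sketch matching the (valid, dataframe) contract above, called the
# same way __get_all_raw_geo calls it below; the tiny matrix is made-up demo
# data and the logger is any standard logger.
def demo_filter_data():
    import logging
    logger = logging.getLogger("filter_demo")
    valid, clean = Utils.filter_data(logger, [[1.0, np.nan], [2.0, 3.0]])
    if valid:
        print(clean.shape)  # quantile-normalized DataFrame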
def __split_control_perturbed(self, gse, column_entry, type_idx):
    control_series = []
    perturbed_series = []
    for series_name, description in gse.columns.iterrows():
        if Utils.is_control(description[type_idx], self.control_labels):
            column_entry['control'] = description[type_idx]
            control_series.append(series_name)
        else:
            column_entry['perturbed'] = description[type_idx]
            perturbed_series.append(series_name)
    return [control_series, perturbed_series]
def __get_all_raw_geo(self):
    all_data = []
    geo_datas = self.storage.get_geo_data()
    for data in geo_datas:
        geo_id = data["name"]
        valid_c, control = Utils.filter_data(self.logger, data["control"])
        valid_p, perturbed = Utils.filter_data(self.logger,
                                               data["perturbed"])
        if valid_c is False or valid_p is False:
            self.logger.error(f"Bad data, skipping geo_id {geo_id}")
            continue
        gene_names = [name.lower() for name in data["genes"]]
        all_data.append({
            'control': control,
            'perturbed': perturbed,
            'genes': gene_names,
            'geo_id': geo_id
        })
    return all_data
def evaluate(self, group_name):
    fig = plt.figure()
    Utils.create_folder_if_not_exist(self.output_folder)
    save_path = os.path.join(self.output_folder, self.name)
    Utils.create_folder_if_not_exist(save_path)
    for validation_source in self.method_rks.keys():
        method_rks = self.method_rks[validation_source]
        methods = []
        for method_name, rks in method_rks.items():
            rks_array = np.sort(np.array(rks))
            self.plot_cdf(rks_array)
            methods.append(method_name)
        name = f"{group_name}_{validation_source}"
        path_file = f"{name}.png"
        path_method = os.path.join(save_path, path_file)
        plt.legend(methods)
        plt.title(name)  # set the title before saving, or it is lost
        plt.savefig(path_method)
        plt.clf()
    fig.clf()
    self.method_rks = {}
    self.logger.info("done")
def setup(self, config):
    self.config = MIDGETXgBoostConfig(**config)
    logger_name = f"MIDGETXgBoost[{self.config.model_name}]"
    self.logger = Utils.get_logger(logger_name)
    model_path = os.path.join(self.config.model_folder,
                              self.config.model_name,
                              'model.json')
    self.logger.info(f"Loading model: {model_path}")
    if os.path.isfile(model_path):
        bst = xgb.Booster(self.config.param)
        bst.load_model(model_path)
        self.model = bst
    else:
        self.logger.warning(f"No model located in {model_path}")
def get_geo(self, filter) -> List[GeoData]:
    if 'name' in filter:
        file_path = os.path.join(self.geo_path, filter['source'],
                                 f"{filter['name']}.json")
        return [self.__load_data_from_file(file_path, GeoData)]
    if 'source' in filter:
        # a single source folder holds the experiment json files directly
        folder = os.path.join(self.geo_path, filter['source'])
        return self.__load_data_from_folder(folder, GeoData)
    # no filter: walk every source folder under the geo root
    folder = self.geo_path
    folders = Utils.list_folders_in_folder(folder)
    ret_data = []
    for fld in folders:
        fld_path = os.path.join(folder, fld)
        dt = self.__load_data_from_folder(fld_path, GeoData)
        ret_data.extend(dt)
    return ret_data
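# Usage sketch for get_geo's filter dict. The source/name values are
# hypothetical placeholders, and `storage` is any object exposing this
# get_geo: an empty filter loads everything, 'source' narrows to one
# provider folder, and 'source' plus 'name' loads a single experiment.
def demo_get_geo(storage):
    everything = storage.get_geo({})
    one_source = storage.get_geo({'source': 'example_source'})
    one_item = storage.get_geo({'source': 'example_source',
                                'name': 'example_experiment'})
    return everything, one_source, one_item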
def generate_single(self, validation_data, id, num_genes) -> GeoData:
    all_tfs = list(validation_data.data.keys())
    picked_tf = random.choice(all_tfs)
    perturbed_genes = set(validation_data.data[picked_tf])
    genes = Utils.get_random_gene_names(num_genes)
    # binary mask: 1 where the gene belongs to the picked TF's target set
    mask = []
    for gene in genes:
        if gene in perturbed_genes:
            mask.append(1)
        else:
            mask.append(0)
    mask = np.array(mask)
    df_factor = self.param.diff_factor
    validation = []
    for index, mask_value in enumerate(mask.tolist()):
        if mask_value == 1:
            validation.append(genes[index])
    num_replicates = self.param.num_replicates
    # broadcast the gene mask across replicates: (num_genes, num_replicates)
    mask = np.array([mask])
    mask = np.repeat(mask, num_replicates, axis=0).T
    control = np.random.rand(num_genes, num_replicates)
    effect = np.random.rand(num_genes, num_replicates) * df_factor
    # perturbation differs from control only on the masked (target) genes
    perturbation = control + np.multiply(mask, effect)
    gene_data = GeoData({
        "name": f"SIL_{id}",
        "perturbed_series_names": ['fakeseries'],
        "control_series_names": ['fakeseries'],
        "extra_info": {"none": "none"},
        "perturbed_array": perturbation.tolist(),
        "control_array": control.tolist(),
        "source": self.config.source,
        "genes": genes,
        "pf": picked_tf
    })
    return gene_data
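# Worked example of the mask broadcast in generate_single (toy numbers:
# 3 genes, 2 replicates, the diff_factor folded into `effect`). Only gene 1
# is a target, so only its row differs between control and perturbation.
def demo_mask_broadcast():
    mask = np.repeat(np.array([[0, 1, 0]]), 2, axis=0).T  # shape (3, 2)
    control = np.full((3, 2), 0.5)
    effect = np.full((3, 2), 0.2)
    perturbation = control + np.multiply(mask, effect)
    # perturbation == [[0.5, 0.5], [0.7, 0.7], [0.5, 0.5]]
    return perturbation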
def __do_input(self, input):
    cache_folder = self.config.cache_folder
    cache_path = os.path.join(self.path, cache_folder, input.name)
    created_folder = Utils.create_folder_if_not_exist(cache_path)
    if created_folder:
        self.logger.info(f"created directory {cache_path}")
    log_data = {}
    for data_item in input.data:
        geo_id = data_item['geoid']
        pf = data_item[input.pf_field].lower()
        info_msg = f'Getting GEO: {geo_id} in cache folder {cache_path}'
        self.logger.info(info_msg)
        gse = self.__download_retry(geo_id, cache_path)
        if gse is None:
            sys.exit(f"Failed to download data for {geo_id}")
        geo_data = self.__do_data_item(gse, geo_id, input.name, pf)
        if geo_data:
            geo_data = self.__fix_bad_data(geo_data)
            self.storage.insert_geo(geo_data)
    self.logger.info('Writing columns to json file')
    log_str = json.dumps(log_data, sort_keys=True, indent=4)
    self.logger.info(f"{log_str}")
    self.logger.info(f'Finished importing GEO data for {input.name}')
def __get_genes(self, gse):
    gene_label = Utils.find_in_array(self.gene_names, gse.table.columns)
    genes_read = gse.table[gene_label].tolist()
    return Utils.deduplicate_genes(genes_read)
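# Sketch of a deduplication helper consistent with how __get_genes uses it.
# Assumption: duplicate gene symbols get a numeric suffix so list length (and
# row alignment with the expression table) is preserved; the real
# Utils.deduplicate_genes may differ.
def deduplicate_genes_sketch(genes):
    seen = {}
    result = []
    for gene in genes:
        count = seen.get(gene, 0)
        result.append(gene if count == 0 else f"{gene}_{count}")
        seen[gene] = count + 1
    return result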