def load_data(organism, conf_parser: GenedescConfigParser):
    """Build and populate the data managers needed to describe the genes of one organism.

    Args:
        organism: key of the organism in the WormBase organisms info provided by ``conf_parser``
        conf_parser (GenedescConfigParser): configuration used to build the managers and locate their files

    Returns:
        a tuple ``(main, sister, human_orthologs)`` where ``main`` is the WBDataManager of ``organism`` with all
        of its data loaded, ``sister`` is the WBDataManager of the configured main sister species with GO ontology
        and GO associations loaded (None when no sister species is configured), and ``human_orthologs`` is a
        DataManager loaded with the GO ontology and associations taken from the human-ortholog URLs of the
        configuration (None unless ``organism`` is "c_elegans")
    """
    log = logging.getLogger("WB Gene Description Pipeline - Data loader")
    sister_manager = None
    human_orthologs_manager = None
    all_organisms = conf_parser.get_wb_organisms_info()
    main_manager = WBDataManager(species=organism, do_relations=None,
                                 go_relations=["subClassOf", "BFO:0000050"], config=conf_parser)

    if organism == "c_elegans":
        human_orthologs_manager = DataManager(go_relations=["subClassOf", "BFO:0000050"], do_relations=None)
        human_go_ontology_url = conf_parser.get_wb_human_orthologs_go_ontology()
        human_go_ontology_cache = os.path.join(conf_parser.get_cache_dir(), "wormbase_agr_human",
                                               "go_ontology.obo")
        human_orthologs_manager.load_ontology_from_file(ontology_type=DataType.GO,
                                                        ontology_url=human_go_ontology_url,
                                                        ontology_cache_path=human_go_ontology_cache,
                                                        config=conf_parser)
        human_go_assoc_url = conf_parser.get_wb_human_orthologs_go_associations()
        human_go_assoc_cache = os.path.join(conf_parser.get_cache_dir(), "wormbase_agr_human",
                                            "go_assoc.daf.gz")
        human_orthologs_manager.load_associations_from_file(associations_type=DataType.GO,
                                                            associations_url=human_go_assoc_url,
                                                            associations_cache_path=human_go_assoc_cache,
                                                            config=conf_parser)

    # a missing key and an empty value both mean "no sister species configured"
    sister_species = all_organisms[organism].get("main_sister_species")
    if sister_species:
        sister_manager = WBDataManager(species=sister_species, do_relations=None,
                                       go_relations=["subClassOf", "BFO:0000050"], config=conf_parser)
        log.info("Loading GO data for sister species")
        sister_manager.load_ontology_from_file(ontology_type=DataType.GO,
                                               ontology_url=sister_manager.go_ontology_url,
                                               ontology_cache_path=sister_manager.go_ontology_cache_path,
                                               config=conf_parser)
        sister_manager.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=sister_manager.go_associations_url,
            associations_cache_path=sister_manager.go_associations_cache_path,
            config=conf_parser)

    log.info("Loading all data for main species")
    main_manager.load_all_data_from_file()
    return main_manager, sister_manager, human_orthologs_manager
def __init__(self, config: GenedescConfigParser, species: str, go_relations: List[str] = None,
             do_relations: List[str] = None, use_cache: bool = False):
    """Create a new data fetcher for WormBase.

    Files will be downloaded from the WB ftp site. For convenience, file locations are automatically generated
    and stored in instance attributes ending in _url for remote files and _cache_path for caching.

    Args:
        config (GenedescConfigParser): configuration providing the raw file source, the cache directory, the
            WormBase release and the organisms info
        species (str): WormBase species to fetch; must be a key of the organisms info returned by ``config``
        go_relations (List[str]): forwarded unchanged to the parent constructor
        do_relations (List[str]): forwarded unchanged to the parent constructor
        use_cache (bool): forwarded unchanged to the parent constructor

    Raises:
        KeyError: if ``species``, its configured sister species or one of its configured orthologs is missing
            from the organisms info, or if ``species`` has no "project_id" entry
    """
    self.config = config
    raw_files_source = config.get_wb_raw_file_sources()
    cache_location = config.get_cache_dir()
    release_version = config.get_wb_release()
    organisms_info = config.get_wb_organisms_info()
    species_info = organisms_info[species]
    project_id = species_info["project_id"]

    # The sister species entry may be present but empty; only look it up when it actually names a species,
    # otherwise the lookup below would fail with a KeyError on the empty value.
    self.sister_sp_fullname = ""
    sister_species = species_info.get("main_sister_species")
    if sister_species and "full_name" in organisms_info[sister_species]:
        self.sister_sp_fullname = organisms_info[sister_species]["full_name"]

    # Stays an empty string (not an empty list) unless every configured ortholog species has a full name.
    self.orth_fullnames = ""
    if "ortholog" in species_info:
        ortholog_infos = [organisms_info[ortholog_sp] for ortholog_sp in species_info["ortholog"]]
        if all("full_name" in ortholog_info for ortholog_info in ortholog_infos):
            self.orth_fullnames = [ortholog_info["full_name"] for ortholog_info in ortholog_infos]

    expression_cluster_anatomy_prefix = species_info.get("ec_anatomy_prefix")
    expression_cluster_molreg_prefix = species_info.get("ec_molreg_prefix")
    expression_cluster_genereg_prefix = species_info.get("ec_genereg_prefix")

    super().__init__(go_relations=go_relations, do_relations=do_relations, use_cache=use_cache)

    # Shared building blocks of the remote and cached file locations.
    species_file_prefix = species + '.' + project_id + '.' + release_version
    species_url_prefix = raw_files_source + '/' + release_version + '/species/' + species + '/' + project_id + \
        '/annotation/' + species_file_prefix
    ontology_url_prefix = raw_files_source + '/' + release_version + '/ONTOLOGY/'
    species_cache_dir = os.path.join(cache_location, "wormbase", release_version, "species", species,
                                     project_id, "annotation")
    ontology_cache_dir = os.path.join(cache_location, "wormbase", release_version, "ONTOLOGY")

    def species_url(suffix):
        # remote location of a per-species annotation file
        return species_url_prefix + suffix

    def species_cache_path(suffix):
        # cached location of a per-species annotation file
        return os.path.join(species_cache_dir, species_file_prefix + suffix)

    def ontology_url(name, extension):
        # remote location of a release-wide file of the ONTOLOGY directory
        return ontology_url_prefix + name + '.' + release_version + extension

    def ontology_cache_path(name, extension):
        # cached location of a release-wide file of the ONTOLOGY directory
        return os.path.join(ontology_cache_dir, name + "." + release_version + extension)

    # gene identifiers
    self.gene_data_cache_path = species_cache_path(".geneIDs.txt.gz")
    self.gene_data_url = species_url('.geneIDs.txt.gz')

    # gene ontology
    self.go_ontology_cache_path = ontology_cache_path("gene_ontology", ".obo")
    self.go_ontology_url = ontology_url('gene_ontology', '.obo')
    self.go_associations_cache_path = species_cache_path(".go_annotations.gaf.gz")
    self.go_associations_url = species_url('.go_annotations.gaf.gz')

    # disease ontology: associations are fetched from the release-wide ONTOLOGY directory but cached under the
    # per-species annotation directory
    self.do_ontology_url = ontology_url('disease_ontology', '.obo')
    self.do_ontology_cache_path = ontology_cache_path("disease_ontology", ".obo")
    self.do_associations_cache_path = species_cache_path(".do_annotations.wb")
    self.do_associations_url = ontology_url('disease_association', '.wb')
    self.do_associations_new_cache_path = species_cache_path(".do_annotations.daf.txt")
    self.do_associations_new_url = ontology_url('disease_association', '.daf.txt')

    # orthology
    self.orthology_url = species_url('.orthologs.txt.gz')
    self.orthology_cache_path = species_cache_path(".orthologs.txt.gz")
    self.orthologs = defaultdict(lambda: defaultdict(list))

    # protein domains
    self.protein_domain_url = species_url('.protein_domains.csv.gz')
    self.protein_domain_cache_path = species_cache_path(".protein_domains.csv.gz")
    self.protein_domains = defaultdict(list)

    # expression (anatomy ontology)
    self.expression_ontology_cache_path = ontology_cache_path("anatomy_ontology", ".obo")
    self.expression_ontology_url = ontology_url('anatomy_ontology', '.obo')
    self.expression_associations_cache_path = ontology_cache_path("anatomy_association", ".wb")
    self.expression_associations_url = ontology_url('anatomy_association', '.wb')

    # expression clusters: the data container is only created when a URL is available for the species
    self.expression_cluster_anatomy_url = self._get_expression_cluster_url(
        prefix=expression_cluster_anatomy_prefix, ec_type="anatomy", release_version=release_version)
    self.expression_cluster_anatomy_cache_path = self._get_expression_cluster_cache_path(
        prefix=expression_cluster_anatomy_prefix, ec_type="anatomy", release_version=release_version,
        cache_location=cache_location)
    self.expression_cluster_anatomy_data = defaultdict(list) if self.expression_cluster_anatomy_url else None

    self.expression_cluster_molreg_url = self._get_expression_cluster_url(
        prefix=expression_cluster_molreg_prefix, ec_type="molReg", release_version=release_version)
    self.expression_cluster_molreg_cache_path = self._get_expression_cluster_cache_path(
        prefix=expression_cluster_molreg_prefix, ec_type="molReg", release_version=release_version,
        cache_location=cache_location)
    self.expression_cluster_molreg_data = defaultdict(list) if self.expression_cluster_molreg_url else None

    self.expression_cluster_genereg_url = self._get_expression_cluster_url(
        prefix=expression_cluster_genereg_prefix, ec_type="geneReg", release_version=release_version)
    self.expression_cluster_genereg_cache_path = self._get_expression_cluster_cache_path(
        prefix=expression_cluster_genereg_prefix, ec_type="geneReg", release_version=release_version,
        cache_location=cache_location)
    self.expression_cluster_genereg_data = defaultdict(list) if self.expression_cluster_genereg_url else None