class TestConfigParser(unittest.TestCase):
    """Unit tests for GenedescConfigParser reading the shared test configuration file."""

    def setUp(self):
        logging.basicConfig(filename=None, level="INFO",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
        self.conf_parser = GenedescConfigParser(config_path)

    def test_exclude_terms_list(self):
        # both GO and DO exclusion lists must be non-empty in the test config
        go_excluded = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.EXCLUDE_TERMS)
        self.assertTrue(len(go_excluded) > 0, "GO exclusion term list not loading")
        do_excluded = self.conf_parser.get_module_property(
            module=Module.DO_EXPERIMENTAL, prop=ConfigModuleProperty.EXCLUDE_TERMS)
        self.assertTrue(len(do_excluded) > 0, "DO terms exclusion not loading")

    def test_rename_terms(self):
        # the test config defines exactly 7 GO renames and none for DO
        go_renames = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS)
        self.assertTrue(len(go_renames) == 7, "GO term renaming list not loading")
        do_renames = self.conf_parser.get_module_property(
            module=Module.DO_EXPERIMENTAL, prop=ConfigModuleProperty.RENAME_TERMS)
        self.assertTrue(do_renames is None, "DO term renaming list should be None")

    def test_evidence_codes(self):
        evidence_groups_map = self.conf_parser.get_evidence_codes_groups_map(module=Module.GO)
        self.assertTrue("EXP" in list(evidence_groups_map.keys()))
def set_associations(self, associations_type: DataType, associations: AssociationSet,
                     config: GenedescConfigParser):
    """set the annotations for the given data type and remove blacklisted annotations

    Args:
        associations_type (DataType): the type of associations to set
        associations (AssociationSet): an association object to set as annotations
        config (GenedescConfigParser): configuration object where to read properties
    """
    if associations_type == DataType.GO:
        logger.info("Setting GO associations")
        self.go_associations = self.remove_blacklisted_annotations(
            association_set=associations, ontology=self.go_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.GO, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.DO:
        logger.info("Setting DO associations")
        self.do_associations = self.remove_blacklisted_annotations(
            association_set=associations, ontology=self.do_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.DO_EXPERIMENTAL, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.EXPR:
        logger.info("Setting Expression associations")
        # bug fix: the blacklist filter was previously run against self.do_ontology;
        # expression annotations must be filtered against the expression ontology
        # (consistent with the EXPR branch of load_associations_from_file)
        self.expression_associations = self.remove_blacklisted_annotations(
            association_set=associations, ontology=self.expression_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.EXPRESSION, prop=ConfigModuleProperty.EXCLUDE_TERMS))
def setUp(self):
    """Configure logging and load the GO test ontology and associations."""
    logging.basicConfig(filename=None, level="ERROR",
                        format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
    logger.info("Starting DataManager tests")
    self.this_dir = os.path.split(__file__)[0]
    self.conf_parser = GenedescConfigParser(
        os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml"))
    self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
    ontology_data_file = os.path.join(self.this_dir, "data", "go_gd_test.obo")
    logger.info("Loading go ontology from file")
    self.df.load_ontology_from_file(
        ontology_type=DataType.GO,
        ontology_url="file://" + ontology_data_file,
        ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
        config=self.conf_parser)
    associations_data_file = os.path.join(self.this_dir, "data",
                                          "gene_association_1.7.wb.partial")
    logger.info("Loading go associations from file")
    self.df.load_associations_from_file(
        associations_type=DataType.GO,
        associations_url="file://" + associations_data_file,
        associations_cache_path=os.path.join(self.this_dir, "cache",
                                             "gene_association_1.7.wb.partial"),
        config=self.conf_parser)
def load_data(organism, conf_parser: GenedescConfigParser):
    """Load all data needed to generate descriptions for a WormBase organism.

    Args:
        organism: WormBase organism key (e.g. "c_elegans")
        conf_parser (GenedescConfigParser): configuration object where to read properties

    Returns:
        tuple: (main species data manager, sister species data manager or None,
        human-orthologs data manager or None — only populated for c_elegans)
    """
    logger = logging.getLogger("WB Gene Description Pipeline - Data loader")
    sister_df = None
    df_agr = None
    organisms_info = conf_parser.get_wb_organisms_info()
    df = WBDataManager(species=organism, do_relations=None,
                       go_relations=["subClassOf", "BFO:0000050"], config=conf_parser)
    if organism == "c_elegans":
        # human orthologs GO data is only needed for the main C. elegans pipeline
        df_agr = DataManager(go_relations=["subClassOf", "BFO:0000050"], do_relations=None)
        human_cache_dir = os.path.join(conf_parser.get_cache_dir(), "wormbase_agr_human")
        df_agr.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url=conf_parser.get_wb_human_orthologs_go_ontology(),
            ontology_cache_path=os.path.join(human_cache_dir, "go_ontology.obo"),
            config=conf_parser)
        df_agr.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=conf_parser.get_wb_human_orthologs_go_associations(),
            associations_cache_path=os.path.join(human_cache_dir, "go_assoc.daf.gz"),
            config=conf_parser)
    sister_species = organisms_info[organism].get("main_sister_species")
    if sister_species:
        sister_df = WBDataManager(species=sister_species, do_relations=None,
                                  go_relations=["subClassOf", "BFO:0000050"],
                                  config=conf_parser)
        logger.info("Loading GO data for sister species")
        sister_df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url=sister_df.go_ontology_url,
            ontology_cache_path=sister_df.go_ontology_cache_path,
            config=conf_parser)
        sister_df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=sister_df.go_associations_url,
            associations_cache_path=sister_df.go_associations_cache_path,
            config=conf_parser)
    logger.info("Loading all data for main species")
    df.load_all_data_from_file()
    return df, sister_df, df_agr
class TestGOModule(unittest.TestCase):
    """Tests for GO data loading and term handling through DataManager."""

    def setUp(self):
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(
            os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(self.this_dir, "data",
                                                      "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(self.this_dir, "cache",
                                                 "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        self.assertTrue(self.df.go_ontology is not None)
        go_parents = self.df.go_ontology.parents("GO:0000075")
        self.assertTrue(any(parent == "GO:0009987" for parent in go_parents))

    def test_annotations_exist(self):
        self.assertTrue(self.df.go_associations is not None)
        gene_annotations = self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001", annot_type=DataType.GO,
            include_obsolete=False, include_negative_results=False,
            priority_list=self.conf_parser.get_annotations_priority(module=Module.GO))
        self.assertTrue(len(gene_annotations) > 0)

    def test_rename_terms(self):
        # renamed terms must no longer be findable by their original names
        renamed_terms = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS)
        self.assertTrue(all(len(self.df.go_ontology.search(term)) == 0
                            for term in list(renamed_terms.keys())))

    def test_exclude_terms(self):
        pass
def set_information_poor_sentence(orth_fullnames: List[str], selected_orthologs,
                                  conf_parser: GenedescConfigParser,
                                  human_df_agr: DataManager, gene_desc: GeneDescription,
                                  dm: WBDataManager, gene: Gene):
    """Add information-poor module sentences to the gene description: GO function of the
    best human ortholog and the gene's predicted protein domains.

    Args:
        orth_fullnames (List[str]): full names of the ortholog species
        selected_orthologs: orthologs selected for the gene
        conf_parser (GenedescConfigParser): configuration object where to read properties
        human_df_agr (DataManager): data manager with human genes data
        gene_desc (GeneDescription): the gene description object to update
        dm (WBDataManager): the main data manager, used for protein domain data
        gene (Gene): the gene being described
    """
    # bug fix: the species-name literal was corrupted to "H**o sapiens", so this
    # branch could never match the configured human full name
    if len(orth_fullnames) == 1 and orth_fullnames[0] == "Homo sapiens":
        best_orth = get_best_human_ortholog_for_info_poor(
            selected_orthologs, conf_parser.get_annotations_priority(module=Module.GO),
            human_df_agr, config=conf_parser)
        if best_orth:
            if not best_orth.startswith("RGD:"):
                best_orth = "RGD:" + best_orth
            human_go_sent_generator = OntologySentenceGenerator(
                gene_id=best_orth, module=Module.GO, data_manager=human_df_agr,
                config=conf_parser, humans=False, limit_to_group="EXPERIMENTAL")
            # one function sentence per qualifier, in a fixed order (previously two
            # copy-pasted blocks for "contributes_to" and "enables")
            for qualifier in ("contributes_to", "enables"):
                human_func_module_sentences = human_go_sent_generator.get_module_sentences(
                    aspect='F', qualifier=qualifier,
                    merge_groups_with_same_prefix=True, keep_only_best_group=True)
                human_func_sent = human_func_module_sentences.get_description()
                if human_func_sent:
                    gene_desc.set_or_extend_module_description_and_final_stats(
                        module=Module.INFO_POOR_HUMAN_FUNCTION,
                        description="human " +
                        human_df_agr.go_associations.subject_label_map[best_orth] +
                        " " + human_func_sent)
    # gene_desc.gene_id[3:] strips the "WB:" prefix to index the domain map
    protein_domains = dm.protein_domains[gene_desc.gene_id[3:]]
    if protein_domains:
        domain_names = [ptdom[1] for ptdom in protein_domains if ptdom[1] != ""]
        dom_word = "domains" if len(domain_names) > 1 else "domain"
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.PROTEIN_DOMAIN,
            description="is predicted to encode a protein with the following " +
                        dom_word + ": " + concatenate_words_with_oxford_comma(
                            domain_names, separator=conf_parser.get_terms_delimiter()))
def load_associations_from_file(self, associations_type: DataType, associations_url: str,
                                associations_cache_path: str,
                                config: GenedescConfigParser) -> None:
    """load annotations of the given type from file and drop blacklisted ones

    Args:
        associations_type (DataType): the type of associations to set
        associations_url (str): url to the association file
        associations_cache_path (str): path to cache file for the associations
        config (GenedescConfigParser): configuration object where to read properties
    """
    assoc_config = AssocParserConfig(remove_double_prefixes=True, paint=True)
    # per-type dispatch: (log message, ontology used, config module, target attribute)
    dispatch = {
        DataType.GO: ("Loading GO associations from file", self.go_ontology,
                      Module.GO, "go_associations"),
        DataType.DO: ("Loading DO associations from file", self.do_ontology,
                      Module.DO_EXP_AND_BIO, "do_associations"),
        DataType.EXPR: ("Loading Expression associations from file",
                        self.expression_ontology, Module.EXPRESSION,
                        "expression_associations"),
    }
    if associations_type not in dispatch:
        return
    log_message, ontology, module, attr_name = dispatch[associations_type]
    logger.info(log_message)
    cached_file = self._get_cached_file(cache_path=associations_cache_path,
                                        file_source_url=associations_url)
    assoc_set = AssociationSetFactory().create_from_assocs(
        assocs=GafParser(config=assoc_config).parse(file=cached_file, skipheader=True),
        ontology=ontology)
    # assign first, then replace with the blacklist-filtered set (mirrors the
    # two-step assignment of the per-branch originals)
    setattr(self, attr_name, assoc_set)
    setattr(self, attr_name, self.remove_blacklisted_annotations(
        association_set=assoc_set, ontology=ontology,
        terms_blacklist=config.get_module_property(
            module=module, prop=ConfigModuleProperty.EXCLUDE_TERMS)))
def setUp(self):
    """Configure logging and load the shared test configuration."""
    log_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
    logging.basicConfig(filename=None, level="INFO", format=log_format)
    logger.info("Starting DataManager tests")
    self.this_dir = os.path.split(__file__)[0]
    config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
    self.conf_parser = GenedescConfigParser(config_path)
def set_ontology(self, ontology_type: DataType, ontology: Ontology,
                 config: GenedescConfigParser, slim_cache_path: str = None) -> None:
    """set the ontology of the given type and apply term renaming, depth computation,
    optional IC setup and slim loading

    Args:
        ontology_type (DataType): the type of ontology to set
        ontology (Ontology): an ontology object to store
        config (GenedescConfigParser): configuration object where to read properties
        slim_cache_path (str): path to slim file to use
    """
    if ontology_type == DataType.GO:
        logger.info("Setting GO ontology")
        self.go_ontology = (ontology.subontology(relations=self.go_relations)
                            if self.go_relations else ontology)
    elif ontology_type == DataType.DO:
        logger.info("Setting DO ontology")
        self.do_ontology = (ontology.subontology(relations=self.do_relations)
                            if self.do_relations else ontology)
    elif ontology_type == DataType.EXPR:
        logger.info("Setting Expression ontology")
        self.expression_ontology = (ontology.subontology(relations=self.expr_relations)
                                    if self.expr_relations else ontology)
    module = get_module_from_data_type(ontology_type)
    # re-read the stored (possibly sub-setted) ontology for post-processing
    stored_ontology = self.get_ontology(data_type=ontology_type)
    rename_regex_map = config.get_module_property(module=module,
                                                  prop=ConfigModuleProperty.RENAME_TERMS)
    if rename_regex_map:
        self.rename_ontology_terms(ontology=stored_ontology,
                                   terms_replacement_regex=rename_regex_map)
    set_all_depths(ontology=stored_ontology, relations=self.get_relations(ontology_type))
    if config.get_module_property(
            module=module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM) == "ic":
        # the "ic" trimming algorithm needs information content pre-computed
        set_ic_ontology_struct(ontology=stored_ontology,
                               relations=self.get_relations(ontology_type))
    if slim_cache_path:
        slim_url = config.get_module_property(module=module,
                                              prop=ConfigModuleProperty.SLIM_URL)
        self.load_slim(module=module, slim_url=slim_url, slim_cache_path=slim_cache_path)
def test_compose_sentence(self):
    """Check sentence composition with cell renaming and male-at-end placement."""
    this_dir = os.path.split(__file__)[0]
    conf_parser = GenedescConfigParser(
        os.path.join(this_dir, os.path.pardir, "tests", "config_test.yml"))
    # several terms: "cell" is renamed away and "male" is moved to the end
    multi_term_sentence = compose_sentence(
        prefix="Is expressed in", additional_prefix="several processes, including",
        term_names=["cell", "tail", "head", "male"],
        postfix="based on experimental observation", config=conf_parser,
        ancestors_with_multiple_children={"head"}, rename_cell=True,
        put_anatomy_male_at_end=True)
    self.assertTrue("cell" not in multi_term_sentence)
    self.assertTrue("and in male" in multi_term_sentence)
    # a lone "cell" term collapses to the "expressed widely" form
    single_term_sentence = compose_sentence(
        prefix="Is expressed in", additional_prefix="several processes, including",
        term_names=["cell"], postfix="based on experimental observation",
        config=conf_parser, rename_cell=True)
    self.assertTrue(single_term_sentence ==
                    "Is expressed widely based on experimental observation")
def load_do_ontology(self):
    """Load the DO ontology from the test data file into a fresh DataManager."""
    # fix: configure logging BEFORE emitting any records; previously basicConfig was
    # called after the first logger.info calls, so those used default settings
    # (consistent with the other setUp methods in this test suite)
    logging.basicConfig(filename=None, level="ERROR",
                        format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
    logger.info("Starting Ontology Tools tests")
    self.this_dir = os.path.split(__file__)[0]
    self.conf_parser = GenedescConfigParser(
        os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml"))
    self.df = DataManager(do_relations=None)
    logger.info("Loading do ontology from file")
    self.df.load_ontology_from_file(
        ontology_type=DataType.DO,
        ontology_url="file://" + os.path.join(self.this_dir, "data", "doid.obo"),
        ontology_cache_path=os.path.join(self.this_dir, "cache", "doid.obo"),
        config=self.conf_parser)
def setUp(self):
    """Configure logging and build a WBDataManager for the C. elegans test config."""
    logging.basicConfig(filename=None, level="ERROR",
                        format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
    logger.info("Starting DataManager tests")
    self.this_dir = os.path.split(__file__)[0]
    wb_config_path = os.path.join(self.this_dir, "config_test_wb.yml")
    self.conf_parser = GenedescConfigParser(wb_config_path)
    self.df = WBDataManager(do_relations=None,
                            go_relations=["subClassOf", "BFO:0000050"],
                            config=self.conf_parser, species="c_elegans")
def set_associations(self, associations_type: DataType, associations: AssociationSet,
                     config: GenedescConfigParser):
    """set the annotations of the given type, applying term remapping and removing
    blacklisted annotations

    Args:
        associations_type (DataType): the type of associations to set
        associations (AssociationSet): an association object to set as annotations
        config (GenedescConfigParser): configuration object where to read properties
    """
    module = get_module_from_data_type(associations_type)
    target_ontology = self.get_ontology(associations_type)
    # first remap terms per config, then drop blacklisted ones
    processed = self.remap_associations(
        associations=associations, ontology=target_ontology,
        associations_map=config.get_module_property(
            module=module, prop=ConfigModuleProperty.REMAP_TERMS))
    processed = self.remove_blacklisted_annotations(
        association_set=processed, ontology=target_ontology,
        terms_blacklist=config.get_module_property(
            module=module, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    if associations_type == DataType.GO:
        logger.info("Setting GO associations")
        self.go_associations = processed
    elif associations_type == DataType.DO:
        logger.info("Setting DO associations")
        self.do_associations = processed
    elif associations_type == DataType.EXPR:
        logger.info("Setting Expression associations")
        self.expression_associations = processed
    if config.get_module_property(
            module=module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM) == "icGO":
        # the "icGO" trimming algorithm needs annotation-frequency-based IC
        set_ic_annot_freq(self.get_ontology(associations_type),
                          self.get_associations(associations_type))
def set_alliance_human_orthology_module(orthologs: List[List[str]],
                                        gene_desc: GeneDescription,
                                        config: GenedescConfigParser,
                                        excluded_orthologs: bool = False):
    """set orthology module for Alliance human orthologs

    Args:
        orthologs (List[List[str]]): list of human orthologs, containing gene_id,
            gene_symbol, and gene_name
        gene_desc (GeneDescription): the gene description object to update
        config (GenedescConfigParser): a gene descriptions configuration object
        excluded_orthologs (bool): whether some of the orthologs have been excluded from
            the final set. If true, the final sentence will include a prefix to specify
            that some orthologs have been omitted
    """
    if not orthologs:
        return
    # sort by gene name; cap the displayed list at 3 entries
    orthologs_display = sorted(orthologs, key=lambda x: x[2])
    if excluded_orthologs or len(orthologs) > 3:
        orthologs_display = orthologs_display[0:3]
        prefix = "several human genes including"
    else:
        prefix = "human"
    gene_labels = [orth[1] + " (" + orth[2] + ")" if orth[2] else orth[1]
                   for orth in orthologs_display]
    sentence = "orthologous to " + prefix + " " + concatenate_words_with_oxford_comma(
        gene_labels, separator=config.get_terms_delimiter())
    gene_desc.set_or_extend_module_description_and_final_stats(module=Module.ORTHOLOGY,
                                                               description=sentence)
def generate_ortholog_sentence_wormbase_human(orthologs: List[List[str]],
                                              human_genes_props: Dict[str, List[str]],
                                              config: GenedescConfigParser):
    """build orthology sentence for WormBase human orthologs

    Args:
        orthologs (List[List[str]]): list of human orthologs, containing gene_id,
            gene_symbol
        human_genes_props (Dict[str, List[str]]): dictionary containing human gene
            properties
        config (GenedescConfigParser): a gene description configuration object

    Returns:
        Tuple[list, str]: the orthologs and the sentence
    """
    if len(orthologs) > 3:
        orthologs = orthologs[0:3]
        prefix = "several human genes including "
    else:
        prefix = "human "

    def display_name(orth):
        # "SYMBOL (name)" when properties are known, bare symbol otherwise
        if orth[0] in human_genes_props and human_genes_props[orth[0]]:
            return (human_genes_props[orth[0]][0] + " (" +
                    human_genes_props[orth[0]][1] + ")")
        return orth[1]

    symbol_name_arr = sorted(display_name(orth) for orth in orthologs)
    orth_sentence = "is an ortholog of " + prefix + concatenate_words_with_oxford_comma(
        symbol_name_arr, separator=config.get_terms_delimiter())
    selected_symbols = [human_genes_props[orth[0]][0] for orth in orthologs
                        if orth[0] in human_genes_props and human_genes_props[orth[0]]]
    return selected_symbols, orth_sentence
def __init__(self, gene_id: str, module: Module, data_manager: DataManager,
             config: GenedescConfigParser, limit_to_group: str = None,
             humans: bool = False):
    """initialize sentence generator object

    Args:
        gene_id (str): id of the gene for which to generate sentences
        module (Module): the description module this generator serves
        data_manager (DataManager): source of ontology, annotations and slim sets
        config (GenedescConfigParser): an optional config object from which to read
            the options
        limit_to_group (str): limit the evidence codes to the specified group
        humans (bool): whether to use the human-specific pre/postfix sentence map
    """
    self.ontology = data_manager.get_ontology(get_data_type_from_module(module))
    self.config = config
    self.module = module
    # terms already used by previously generated sentences (supports overlap removal)
    self.terms_already_covered = set()
    # (aspect, qualifier) -> evidence group -> set of term ids
    self.terms_groups = defaultdict(lambda: defaultdict(set))
    self.evidence_groups_priority_list = config.get_evidence_groups_priority_list(
        module=module)
    self.prepostfix_sentences_map = config.get_prepostfix_sentence_map(module=module,
                                                                       humans=humans)
    self.gene_annots = data_manager.get_annotations_for_gene(
        gene_id=gene_id, annot_type=get_data_type_from_module(module),
        priority_list=config.get_annotations_priority(module=module))
    # look up the configured trimming algorithm name and instantiate the matching
    # trimmer class with the module's blacklist, slim bonus and slim set
    self.trimmer = CONF_TO_TRIMMING_CLASS[config.get_module_property(
        module=module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM)](
        ontology=self.ontology,
        annotations=data_manager.get_associations(get_data_type_from_module(module)),
        nodeids_blacklist=config.get_module_property(
            module=module, prop=ConfigModuleProperty.EXCLUDE_TERMS),
        slim_terms_ic_bonus_perc=config.get_module_property(
            module=module, prop=ConfigModuleProperty.SLIM_BONUS_PERC),
        slim_set=data_manager.get_slim(module=module))
    self.set_terms_groups(module, config, limit_to_group, humans)
class TestGOModule(unittest.TestCase):
    """Integration tests for DataManager: GO loading, caching, gene data and
    association handling."""

    def setUp(self):
        # configure logging and load the shared fixtures: test config, GO ontology
        # and GO associations from the test data directory
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(
            os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(self.this_dir, "data",
                                                      "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(self.this_dir, "cache",
                                                 "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        # GO:0000075 must be present with GO:0009987 among its parents
        self.assertTrue(self.df.go_ontology is not None)
        self.assertTrue(any(parent == "GO:0009987" for parent in
                            self.df.go_ontology.parents("GO:0000075")))

    def test_annotations_exist(self):
        self.assertTrue(self.df.go_associations is not None)
        self.assertTrue(len(self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001", annot_type=DataType.GO, include_obsolete=False,
            include_negative_results=False,
            priority_list=self.conf_parser.get_annotations_priority(module=Module.GO))) > 0)

    def test_rename_terms(self):
        # terms listed in the RENAME_TERMS config must no longer be found by name
        self.assertTrue(all(len(self.df.go_ontology.search(term)) == 0 for term in list(
            self.conf_parser.get_module_property(
                module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS).keys())))

    def test_exclude_terms(self):
        # no annotation may reference GO:0008286 after loading
        # (presumably blacklisted by the test config — verify against config_test.yml)
        test_annot = self.df.get_annotations_for_gene("WB:WBGene00000001",
                                                      annot_type=DataType.GO)
        self.assertTrue(all([annot["object"]["id"] != "GO:0008286"
                             for annot in test_annot]))

    def test_download_gz_file(self):
        # the cached copy of a .gz file is returned without the .gz suffix
        test_file = self.df._get_cached_file(
            cache_path=os.path.join(self.this_dir, "cache",
                                    "c_elegans.PRJNA13758.WS273.geneIDs.txt.gz"),
            file_source_url="file://" + os.path.join(
                self.this_dir, "data", "c_elegans.PRJNA13758.WS273.geneIDs.txt.gz"))
        self.assertTrue(test_file == os.path.join(
            self.this_dir, "cache", "c_elegans.PRJNA13758.WS273.geneIDs.txt"))

    def test_gene_data_functions(self):
        # four genes covering every dead/pseudo flag combination; check that
        # get_gene_data filters correctly on the two include_* switches
        self.df.set_gene_data(gene_data=[Gene("1", "gene1", True, False),
                                         Gene("2", "gene2", False, True),
                                         Gene("3", "gene3", False, False),
                                         Gene("4", "gene4", True, True)])
        self.assertTrue(len([g for g in self.df.get_gene_data(
            include_dead_genes=False, include_pseudo_genes=False)]) == 1)
        self.assertTrue(len([g for g in self.df.get_gene_data(
            include_dead_genes=True, include_pseudo_genes=False)]) == 2)
        self.assertTrue(len([g for g in self.df.get_gene_data(
            include_dead_genes=False, include_pseudo_genes=True)]) == 2)
        self.assertTrue(len([g for g in self.df.get_gene_data(
            include_dead_genes=True, include_pseudo_genes=True)]) == 4)

    def test_get_human_gene_props(self):
        human_gene_props = self.df.get_human_gene_props()
        self.assertTrue(len(human_gene_props) > 0)

    def test_get_ensembl_hgnc_ids_map(self):
        ensembl_hgnc_ids_map = self.df.get_ensembl_hgnc_ids_map()
        self.assertTrue(len(ensembl_hgnc_ids_map) > 0)

    def test_set_ontology(self):
        # build a tiny artificial ontology (3 children under one root) and verify the
        # node set is preserved after set_ontology
        ontology = OntologyFactory().create()
        for i in range(4):
            ontology.add_node(i, 'node' + str(i))
        ontology.add_parent(1, 0)
        ontology.add_parent(2, 0)
        ontology.add_parent(3, 0)
        self.df.set_ontology(ontology_type=DataType.GO, ontology=ontology,
                             config=self.conf_parser)
        self.assertTrue(list(self.df.go_ontology.nodes()) == list(ontology.nodes()))

    def test_set_associations(self):
        associations = []
        associations.append(DataManager.create_annotation_record(
            "", "1", "a", "protein_coding", "001", "GO:0019901", "", "F", "EXP", None,
            "WB", ""))
        associations.append(DataManager.create_annotation_record(
            "", "2", "b", "protein_coding", "001", "GO:0005515", "", "F", "EXP", None,
            "WB", ""))
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations,
                                                            ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs,
                                 config=self.conf_parser)
        self.assertTrue(self.df.go_associations)

    def test_remap_associations(self):
        # an annotation to GO:0018996 ends up on GO:0042303 after set_associations
        # (presumably driven by the REMAP_TERMS test config — verify there)
        associations = []
        associations.append(DataManager.create_annotation_record(
            "", "1", "a", "protein_coding", "001", "GO:0018996", "", "F", "EXP", None,
            "WB", ""))
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations,
                                                            ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs,
                                 config=self.conf_parser)
        self.assertEqual(
            self.df.go_associations.associations_by_subj["1"][0]["object"]["id"],
            "GO:0042303")
def get_module_sentences(self, config: GenedescConfigParser, aspect: str,
                         qualifier: str = '', keep_only_best_group: bool = False,
                         merge_groups_with_same_prefix: bool = False,
                         high_priority_term_ids: List[str] = None):
    """generate description for a specific combination of aspect and qualifier

    Args:
        config (GenedescConfigParser): a configuration object from which to read
            properties
        aspect (str): a data type aspect
        qualifier (str): qualifier
        keep_only_best_group (bool): whether to get only the evidence group with highest
            priority and discard the other evidence groups
        merge_groups_with_same_prefix (bool): whether to merge the phrases for evidence
            groups with the same prefix
        high_priority_term_ids (List[str]): list of ids for terms that must always appear
            in the sentence with higher priority than the other terms. Trimming is not
            applied to these terms

    Returns:
        ModuleSentences: the module sentences
    """
    # read all module-level options controlling trimming and phrasing
    cat_several_words = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_CATEGORY_WORD)
    del_overlap = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.REMOVE_OVERLAP)
    remove_parents = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.DEL_PARENTS_IF_CHILD)
    remove_child_terms = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.DEL_CHILDREN_IF_PARENT)
    max_terms = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.MAX_NUM_TERMS_IN_SENTENCE)
    exclude_terms = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.EXCLUDE_TERMS)
    cutoff_final_word = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_WORD)
    rename_cell = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.RENAME_CELL)
    if not cat_several_words:
        # fallback generic category words, keyed by aspect letter
        cat_several_words = {'F': 'functions', 'P': 'processes', 'C': 'components',
                             'D': 'diseases', 'A': 'tissues'}
    sentences = []
    terms_already_covered = set()
    evidence_group_priority = {eg: p for p, eg in
                               enumerate(self.evidence_groups_priority_list)}
    # iterate term groups for this (aspect, qualifier) in evidence-group priority order
    for terms, evidence_group, priority in sorted(
            [(t, eg, evidence_group_priority[eg]) for eg, t in
             self.terms_groups[(aspect, qualifier)].items()], key=lambda x: x[2]):
        terms, trimmed, add_others, ancestors_covering_multiple_children = \
            self.reduce_terms(terms, max_terms, aspect, config, del_overlap,
                              terms_already_covered, exclude_terms, remove_parents,
                              remove_child_terms, high_priority_term_ids)
        if (aspect, evidence_group, qualifier) in self.prepostfix_sentences_map and \
                len(terms) > 0:
            sentences.append(
                _get_single_sentence(
                    node_ids=terms, ontology=self.ontology, aspect=aspect,
                    evidence_group=evidence_group, qualifier=qualifier,
                    prepostfix_sentences_map=self.prepostfix_sentences_map,
                    terms_merged=False, trimmed=trimmed, add_others=add_others,
                    truncate_others_generic_word=cutoff_final_word,
                    truncate_others_aspect_words=cat_several_words,
                    ancestors_with_multiple_children=ancestors_covering_multiple_children,
                    rename_cell=rename_cell))
            if keep_only_best_group:
                # NOTE(review): returns after the first group that produced a sentence;
                # placement reconstructed from collapsed source — confirm intended scope
                return ModuleSentences(sentences)
    if merge_groups_with_same_prefix:
        sentences = self.merge_sentences_with_same_prefix(
            sentences=sentences, remove_parent_terms=remove_parents,
            rename_cell=rename_cell, high_priority_term_ids=high_priority_term_ids)
    return ModuleSentences(sentences)
def __init__(self, config: GenedescConfigParser, species: str, go_relations: List[str] = None,
             do_relations: List[str] = None, use_cache: bool = False):
    """create a new data fetcher for WormBase. Files will be downloaded from WB ftp site. For convenience, file
    locations are automatically generated and stored in class variables ending in _url for remote files and
    _cache_path for caching

    Args:
        config (GenedescConfigParser): configuration object from which to read file sources and cache location
        species (str): WormBase species to fetch
        go_relations (List[str]): ontology relations to keep when building the GO subontology
        do_relations (List[str]): ontology relations to keep when building the DO subontology
        use_cache (bool): whether to reuse previously downloaded files
    """
    self.config = config
    raw_files_source = config.get_wb_raw_file_sources()
    cache_location = config.get_cache_dir()
    release_version = config.get_wb_release()
    organisms_info = config.get_wb_organisms_info()
    project_id = organisms_info[species]["project_id"]
    # full name of the main sister species, used later for sister-species orthology sentences;
    # left empty when the configuration does not define one
    self.sister_sp_fullname = ""
    if "main_sister_species" in organisms_info[species] and "full_name" in \
            organisms_info[organisms_info[species]["main_sister_species"]]:
        self.sister_sp_fullname = organisms_info[organisms_info[species]["main_sister_species"]]["full_name"]
    # full names of configured ortholog species; only set when every ortholog has a full name
    self.orth_fullnames = ""
    if "ortholog" in organisms_info[species] and all(["full_name" in organisms_info[ortholog_sp] for ortholog_sp
                                                      in organisms_info[species]["ortholog"]]):
        self.orth_fullnames = [organisms_info[ortholog_sp]["full_name"] for ortholog_sp in
                               organisms_info[species]["ortholog"]]
    # expression cluster file prefixes are optional, per-species settings
    expression_cluster_anatomy_prefix = organisms_info[species]["ec_anatomy_prefix"] if \
        "ec_anatomy_prefix" in organisms_info[species] else None
    expression_cluster_molreg_prefix = organisms_info[species]["ec_molreg_prefix"] if \
        "ec_molreg_prefix" in organisms_info[species] else None
    expression_cluster_genereg_prefix = organisms_info[species]["ec_genereg_prefix"] if \
        "ec_genereg_prefix" in organisms_info[species] else None
    super().__init__(go_relations=go_relations, do_relations=do_relations, use_cache=use_cache)
    # gene id/name mapping file
    self.gene_data_cache_path = os.path.join(cache_location, "wormbase", release_version, "species", species,
                                             project_id, "annotation", species + '.' + project_id + '.'
                                             + release_version + ".geneIDs.txt.gz")
    self.gene_data_url = raw_files_source + '/' + release_version + '/species/' + species + '/' + project_id + \
        '/annotation/' + species + '.' + project_id + '.' + release_version + '.geneIDs.txt.gz'
    # GO ontology and GO annotations (gaf)
    self.go_ontology_cache_path = os.path.join(cache_location, "wormbase", release_version, "ONTOLOGY",
                                               "gene_ontology." + release_version + ".obo")
    self.go_ontology_url = raw_files_source + '/' + release_version + '/ONTOLOGY/gene_ontology.' + \
        release_version + '.obo'
    self.go_associations_cache_path = os.path.join(cache_location, "wormbase", release_version, "species", species,
                                                   project_id, "annotation", species + '.' + project_id + '.' +
                                                   release_version + ".go_annotations.gaf.gz")
    self.go_associations_url = raw_files_source + '/' + release_version + '/species/' + species + '/' + \
        project_id + '/annotation/' + species + '.' + project_id + '.' + release_version + \
        '.go_annotations.gaf.gz'
    # DO ontology and disease annotations (legacy .wb file plus newer .daf.txt file)
    self.do_ontology_url = raw_files_source + '/' + release_version + '/ONTOLOGY/disease_ontology.' + \
        release_version + '.obo'
    self.do_ontology_cache_path = os.path.join(cache_location, "wormbase", release_version, "ONTOLOGY",
                                               "disease_ontology." + release_version + ".obo")
    self.do_associations_cache_path = os.path.join(cache_location, "wormbase", release_version, "species", species,
                                                   project_id, "annotation", species + '.' + project_id + '.' +
                                                   release_version + ".do_annotations.wb")
    self.do_associations_url = raw_files_source + '/' + release_version + '/ONTOLOGY/disease_association.' + \
        release_version + '.wb'
    self.do_associations_new_cache_path = os.path.join(cache_location, "wormbase", release_version, "species",
                                                       species, project_id, "annotation", species + '.' +
                                                       project_id + '.' + release_version +
                                                       ".do_annotations.daf.txt")
    self.do_associations_new_url = raw_files_source + '/' + release_version + '/ONTOLOGY/disease_association.' + \
        release_version + '.daf.txt'
    # orthology data
    self.orthology_url = raw_files_source + '/' + release_version + '/species/' + species + '/' + project_id + \
        '/annotation/' + species + '.' + project_id + '.' + release_version + '.orthologs.txt.gz'
    self.orthology_cache_path = os.path.join(cache_location, "wormbase", release_version, "species", species,
                                             project_id, "annotation", species + '.' + project_id + '.' +
                                             release_version + ".orthologs.txt.gz")
    self.orthologs = defaultdict(lambda: defaultdict(list))
    # protein domain data
    self.protein_domain_url = raw_files_source + '/' + release_version + '/species/' + species + '/' + \
        project_id + '/annotation/' + species + '.' + project_id + '.' + release_version + \
        '.protein_domains.csv.gz'
    self.protein_domain_cache_path = os.path.join(cache_location, "wormbase", release_version, "species", species,
                                                  project_id, "annotation", species + '.' + project_id + '.' +
                                                  release_version + ".protein_domains.csv.gz")
    self.protein_domains = defaultdict(list)
    # anatomy (expression) ontology and associations
    self.expression_ontology_cache_path = os.path.join(cache_location, "wormbase", release_version, "ONTOLOGY",
                                                       "anatomy_ontology." + release_version + ".obo")
    self.expression_ontology_url = raw_files_source + '/' + release_version + '/ONTOLOGY/anatomy_ontology.' + \
        release_version + '.obo'
    self.expression_associations_cache_path = os.path.join(cache_location, "wormbase", release_version, "ONTOLOGY",
                                                           "anatomy_association." + release_version + ".wb")
    self.expression_associations_url = raw_files_source + '/' + release_version + \
        '/ONTOLOGY/anatomy_association.' + release_version + '.wb'
    # expression cluster files; urls/paths are None when the species has no configured prefix,
    # and the corresponding data dict is only allocated when a url exists
    self.expression_cluster_anatomy_url = self._get_expression_cluster_url(
        prefix=expression_cluster_anatomy_prefix, ec_type="anatomy", release_version=release_version)
    self.expression_cluster_anatomy_cache_path = self._get_expression_cluster_cache_path(
        prefix=expression_cluster_anatomy_prefix, ec_type="anatomy", release_version=release_version,
        cache_location=cache_location)
    self.expression_cluster_anatomy_data = defaultdict(list) if self.expression_cluster_anatomy_url else None
    self.expression_cluster_molreg_url = self._get_expression_cluster_url(
        prefix=expression_cluster_molreg_prefix, ec_type="molReg", release_version=release_version)
    self.expression_cluster_molreg_cache_path = self._get_expression_cluster_cache_path(
        prefix=expression_cluster_molreg_prefix, ec_type="molReg", release_version=release_version,
        cache_location=cache_location)
    self.expression_cluster_molreg_data = defaultdict(list) if self.expression_cluster_molreg_url else None
    self.expression_cluster_genereg_url = self._get_expression_cluster_url(
        prefix=expression_cluster_genereg_prefix, ec_type="geneReg", release_version=release_version)
    self.expression_cluster_genereg_cache_path = self._get_expression_cluster_cache_path(
        prefix=expression_cluster_genereg_prefix, ec_type="geneReg", release_version=release_version,
        cache_location=cache_location)
    self.expression_cluster_genereg_data = defaultdict(list) if self.expression_cluster_genereg_url else None
def load_associations_from_file(self, associations_type: DataType, associations_url: str,
                                associations_cache_path: str, config: GenedescConfigParser,
                                association_additional_url: str = None,
                                association_additional_cache_path: str = None) -> None:
    """load annotation associations (GO, expression, or disease) from file

    Args:
        associations_type (DataType): the type of associations to load
        associations_url (str): url to the association file
        associations_cache_path (str): path to cache file for the associations
        config (GenedescConfigParser): configuration object where to read properties
        association_additional_url (str): url to an additional association file (DO only)
        association_additional_cache_path (str): cache path for the additional association file (DO only)
    """
    logger.info("Loading associations from file")
    if associations_type == DataType.GO:
        super().load_associations_from_file(associations_type=associations_type,
                                            associations_url=associations_url,
                                            associations_cache_path=associations_cache_path, config=config)
    elif associations_type == DataType.EXPR:
        associations = []
        file_path = self._get_cached_file(cache_path=associations_cache_path, file_source_url=associations_url)
        # fix: close the annotation file deterministically instead of leaking the handle
        with open(file_path) as assoc_file:
            for line in assoc_file:
                if not line.strip().startswith("!"):
                    linearr = line.strip().split("\t")
                    # skip annotations whose term is not in the loaded expression ontology
                    if self.expression_ontology.node(linearr[4]):
                        gene_id = linearr[0] + ":" + linearr[1]
                        qualifiers = linearr[3].split("|")
                        # NOTE(review): "Partial"/"Certain" qualifiers are normalized to "Verified";
                        # len(qualifiers) == 0 can never be true after str.split (it yields at least
                        # one element) — presumably meant to catch an empty qualifier column. Confirm.
                        if len(qualifiers) == 0 or "Partial" in qualifiers or "Certain" in qualifiers:
                            qualifiers = ["Verified"]
                        associations.append(DataManager.create_annotation_record(
                            line, gene_id, linearr[2], linearr[11], linearr[12], linearr[4], qualifiers,
                            linearr[8], linearr[6], linearr[5].split("|"), linearr[14], linearr[13]))
        self.expression_associations = AssociationSetFactory().create_from_assocs(
            assocs=associations, ontology=self.expression_ontology)
        self.expression_associations = self.remove_blacklisted_annotations(
            association_set=self.expression_associations, ontology=self.expression_ontology,
            terms_blacklist=config.get_module_property(module=Module.EXPRESSION,
                                                       prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.DO:
        self.do_associations = AssociationSetFactory().create_from_assocs(
            assocs=GafParser().parse(file=self._get_cached_file(cache_path=associations_cache_path,
                                                                file_source_url=associations_url),
                                     skipheader=True),
            ontology=self.do_ontology)
        if association_additional_cache_path and association_additional_url:
            # keep only the IEA annotations from the main file; curated (non-IEA) annotations are
            # re-read from the additional daf file below
            associations = []
            for subj_associations in self.do_associations.associations_by_subj.values():
                for association in subj_associations:
                    if association["evidence"]["type"] == "IEA":
                        associations.append(association)
            file_path = self._get_cached_file(cache_path=association_additional_cache_path,
                                              file_source_url=association_additional_url)
            header = True
            # fix: close the additional annotation file deterministically as well
            with open(file_path) as assoc_file:
                for line in assoc_file:
                    if not line.strip().startswith("!"):
                        if not header:
                            linearr = line.strip().split("\t")
                            if self.do_ontology.node(linearr[10]) and linearr[16] != "IEA":
                                gene_ids = [linearr[2]]
                                # allele rows may carry several affected genes in column 4
                                if linearr[1] == "allele":
                                    gene_ids = linearr[4].split(",")
                                for gene_id in gene_ids:
                                    associations.append(DataManager.create_annotation_record(
                                        line, gene_id, linearr[3], linearr[1], linearr[0], linearr[10],
                                        linearr[9].split("|"), "D", linearr[16], linearr[18].split("|"),
                                        linearr[20], linearr[19]))
                        else:
                            # first non-comment line is the column header
                            header = False
            self.do_associations = AssociationSetFactory().create_from_assocs(assocs=associations,
                                                                              ontology=self.do_ontology)
        self.do_associations = self.remove_blacklisted_annotations(
            association_set=self.do_associations, ontology=self.do_ontology,
            terms_blacklist=config.get_module_property(module=Module.DO_EXPERIMENTAL,
                                                       prop=ConfigModuleProperty.EXCLUDE_TERMS))
def __init__(self, gene_id: str, module: Module, data_manager: DataManager, config: GenedescConfigParser,
             limit_to_group: str = None, humans: bool = False):
    """initialize sentence generator object

    Args:
        gene_id (str): id of the gene for which sentences will be generated
        module (Module): the description module to generate sentences for
        data_manager (DataManager): data manager from which ontologies and annotations are read
        config (GenedescConfigParser): an optional config object from which to read the options
        limit_to_group (str): limit the evidence codes to the specified group
        humans (bool): whether to use the human-specific prefix/postfix sentence templates
    """
    annot_type = None
    # fix: DO_BIOMARKER was previously accessed through the enum *member* (module.DO_BIOMARKER);
    # member-based access to other members is deprecated since Python 3.11 and removed in 3.12,
    # so access it through the Module class like the other comparisons on this line
    if module == Module.DO_ORTHOLOGY or module == Module.DO_EXPERIMENTAL or module == Module.DO_BIOMARKER:
        self.ontology = data_manager.do_ontology
        annot_type = DataType.DO
    elif module == Module.GO:
        self.ontology = data_manager.go_ontology
        annot_type = DataType.GO
    elif module == Module.EXPRESSION:
        self.ontology = data_manager.expression_ontology
        annot_type = DataType.EXPR
    self.evidence_groups_priority_list = config.get_evidence_groups_priority_list(module=module)
    self.prepostfix_sentences_map = config.get_prepostfix_sentence_map(module=module, humans=humans)
    # terms grouped by (aspect, qualifier) and then by evidence group
    self.terms_groups = defaultdict(lambda: defaultdict(set))
    ev_codes_groups_maps = config.get_evidence_codes_groups_map(module=module)
    annotations = data_manager.get_annotations_for_gene(
        gene_id=gene_id, annot_type=annot_type, priority_list=config.get_annotations_priority(module=module))
    self.annotations = annotations
    self.module = module
    self.data_manager = data_manager
    self.annot_type = annot_type
    # optionally restrict evidence codes to a single configured group
    evidence_codes_groups_map = {
        evcode: group for evcode, group in ev_codes_groups_maps.items()
        if limit_to_group is None or limit_to_group in ev_codes_groups_maps[evcode]}
    prepostfix_special_cases_sent_map = config.get_prepostfix_sentence_map(module=module,
                                                                           special_cases_only=True,
                                                                           humans=humans)
    if len(annotations) > 0:
        for annotation in annotations:
            if annotation["evidence"]["type"] in evidence_codes_groups_map:
                aspect = annotation["aspect"]
                ev_group = evidence_codes_groups_map[annotation["evidence"]["type"]]
                qualifier = "_".join(sorted(annotation["qualifiers"])) if "qualifiers" in annotation else ""
                if prepostfix_special_cases_sent_map and (aspect, ev_group, qualifier) in \
                        prepostfix_special_cases_sent_map:
                    # special-case templates match on the term label; the first matching case renames
                    # the evidence group and inserts it right after the original group in the
                    # priority list so its sentences sort next to the original group's
                    for special_case in prepostfix_special_cases_sent_map[(aspect, ev_group, qualifier)]:
                        if re.match(re.escape(special_case[1]),
                                    self.ontology.label(annotation["object"]["id"], id_if_null=True)):
                            ev_group = evidence_codes_groups_map[annotation["evidence"]["type"]] + \
                                str(special_case[0])
                            if ev_group not in self.evidence_groups_priority_list:
                                self.evidence_groups_priority_list.insert(
                                    self.evidence_groups_priority_list.index(
                                        evidence_codes_groups_map[annotation["evidence"]["type"]]) + 1,
                                    ev_group)
                            break
                self.terms_groups[(aspect, qualifier)][ev_group].add(annotation["object"]["id"])
def main():
    """entry point for the WormBase gene description pipeline: parse CLI options, generate descriptions for each
    configured organism, and write them out in the requested formats"""
    parser = argparse.ArgumentParser(description="Generate gene descriptions for wormbase")
    parser.add_argument("-c", "--config-file", metavar="config_file", dest="config_file", type=str,
                        default="config.yml", help="configuration file. Default ./config.yaml")
    parser.add_argument("-C", "--use-cache", dest="use_cache", action="store_true", default=False,
                        help="Use cached source files from cache_location specified in config file. Download them "
                             "from raw_file_source (configured in config file) if not yet cached")
    parser.add_argument("-l", "--log-file", metavar="log_file", dest="log_file", type=str, default=None,
                        help="path to the log file to generate. Default ./genedescriptions.log")
    parser.add_argument("-L", "--log-level", dest="log_level",
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help="set the logging level")
    parser.add_argument("-t", "--textpressoapi-token", metavar="textpresso_token", dest="textpresso_token",
                        type=str, help="Texpresso api token")
    parser.add_argument("-o", "--output-formats", metavar="output_formats", dest="output_formats", type=str,
                        nargs="+", default=["ace", "txt", "json", "tsv"],
                        help="file formats to generate. Accepted values "
                             "are: ace, txt, json, tsv")
    args = parser.parse_args()
    conf_parser = GenedescConfigParser(args.config_file)
    logging.basicConfig(filename=args.log_file, level=args.log_level,
                        format='%(asctime)s - %(name)s - %(levelname)s:' '%(message)s', force=True)
    logger = logging.getLogger("WB Gene Description Pipeline")
    organisms_list = conf_parser.get_wb_organisms_to_process()
    human_genes_props = DataManager.get_human_gene_props()
    api_manager = APIManager(textpresso_api_token=args.textpresso_token)
    for organism in organisms_list:
        logger.info("Processing organism " + organism)
        # NOTE(review): organisms info is re-read from the config on every loop iteration even
        # though it looks loop-invariant — confirm whether it could be hoisted before the loop
        species = conf_parser.get_wb_organisms_info()
        dm, sister_df, df_agr = load_data(organism=organism, conf_parser=conf_parser)
        desc_writer = DescriptionsWriter()
        desc_writer.overall_properties.species = organism
        # report the *next* release: increment the last character of the configured release string
        desc_writer.overall_properties.release_version = conf_parser.get_wb_release()[0:-1] + \
            str(int(conf_parser.get_wb_release()[-1]) + 1)
        desc_writer.overall_properties.date = datetime.date.today().strftime("%B %d, %Y")
        for gene in dm.get_gene_data():
            logger.debug("Generating description for gene " + gene.name)
            gene_desc = GeneDescription(gene_id=gene.id, config=conf_parser, gene_name=gene.name,
                                        add_gene_name=False)
            selected_orthologs, orth_sent = get_best_orthologs_and_sentence(
                dm=dm, orth_fullnames=dm.orth_fullnames, human_genes_props=human_genes_props,
                gene_desc=gene_desc, api_manager=api_manager, config=conf_parser)
            set_gene_ontology_module(dm=dm, conf_parser=conf_parser, gene_desc=gene_desc, gene=gene)
            set_tissue_expression_sentence(dm=dm, gene=gene, conf_parser=conf_parser, gene_desc=gene_desc)
            # expression cluster data is only used as a fallback when no description was produced so far
            if not gene_desc.description:
                set_expression_cluster_sentence(dm=dm, conf_parser=conf_parser, gene_desc=gene_desc, gene=gene,
                                                api_manager=api_manager)
            set_disease_module(df=dm, conf_parser=conf_parser, gene=gene, gene_desc=gene_desc)
            # information-poor sentence is a fallback for genes without a GO description
            if not gene_desc.go_description:
                set_information_poor_sentence(orth_fullnames=dm.orth_fullnames,
                                              selected_orthologs=selected_orthologs,
                                              conf_parser=conf_parser, human_df_agr=df_agr,
                                              gene_desc=gene_desc, dm=dm, gene=gene)
            gene_desc.set_or_extend_module_description_and_final_stats(module=Module.ORTHOLOGY,
                                                                       description=orth_sent)
            # add a sister-species sentence only when the gene has experimentally supported
            # orthologs in the configured main sister species
            if "main_sister_species" in species[organism] and species[organism]["main_sister_species"] and \
                    dm.get_best_orthologs_for_gene(gene.id, orth_species_full_name=[dm.sister_sp_fullname],
                                                   sister_species_data_fetcher=sister_df,
                                                   ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP",
                                                                        "HTP", "HDA", "HMP", "HGI", "HEP"])[0]:
                set_sister_species_sentence(dm=dm, sister_sp_fullname=dm.sister_sp_fullname, sister_df=sister_df,
                                            species=species, organism=organism, gene_desc=gene_desc,
                                            conf_parser=conf_parser, gene=gene)
            desc_writer.add_gene_desc(gene_desc)
        logger.info("All genes processed for " + organism)
        date_prefix = datetime.date.today().strftime("%Y%m%d")
        if "json" in args.output_formats:
            logger.info("Writing descriptions to json")
            desc_writer.write_json(os.path.join(conf_parser.get_out_dir(),
                                                date_prefix + "_" + organism + ".json"),
                                   include_single_gene_stats=True, data_manager=dm)
        if "txt" in args.output_formats:
            logger.info("Writing descriptions to txt")
            desc_writer.write_plain_text(os.path.join(conf_parser.get_out_dir(),
                                                      date_prefix + "_" + organism + ".txt"))
        if "tsv" in args.output_formats:
            logger.info("Writing descriptions to tsv")
            desc_writer.write_tsv(os.path.join(conf_parser.get_out_dir(),
                                               date_prefix + "_" + organism + ".tsv"))
        if "ace" in args.output_formats:
            logger.info("Writing descriptions to ace")
            curators = ["WBPerson324", "WBPerson37462"]
            release_version = conf_parser.get_wb_release()
            desc_writer.write_ace(os.path.join(conf_parser.get_out_dir(),
                                               date_prefix + "_" + organism + ".ace"),
                                  curators, release_version)
def set_expression_cluster_sentence(dm: WBDataManager, conf_parser: GenedescConfigParser,
                                    gene_desc: GeneDescription, gene: Gene, api_manager: APIManager):
    """add expression cluster sentences (anatomy, molecule regulation, gene regulation) to the gene description

    Args:
        dm (WBDataManager): data manager holding expression cluster data
        conf_parser (GenedescConfigParser): configuration object from which to read properties
        gene_desc (GeneDescription): the gene description object to extend
        gene (Gene): the gene being described
        api_manager (APIManager): api manager used to rank terms by textpresso popularity
    """
    expr_sentence_generator = OntologySentenceGenerator(gene_id=gene.id, module=Module.EXPRESSION,
                                                        data_manager=dm, config=conf_parser)
    # expression cluster files key genes without the 3-character id prefix
    ec_gene_id = gene_desc.gene_id[3:]
    ec_anatomy_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, expression_cluster_type=ExpressionClusterType.ANATOMY,
        feature=ExpressionClusterFeature.STUDIES)
    ec_anatomy_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, feature=ExpressionClusterFeature.TERMS,
        expression_cluster_type=ExpressionClusterType.ANATOMY)
    # prefer ontology-based sentences when an expression ontology is loaded; otherwise fall back
    # to a plain enumeration of the anatomy terms
    if dm.expression_ontology is not None:
        expression_enriched_module_sentences = expr_sentence_generator.get_module_sentences(
            aspect='A', qualifier="Enriched", merge_groups_with_same_prefix=True, keep_only_best_group=False)
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description=expression_enriched_module_sentences.get_description(),
            additional_postfix_terms_list=ec_anatomy_studies,
            additional_postfix_final_word="studies", use_single_form=True)
    elif ec_anatomy_terms:
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description="is enriched in " + concatenate_words_with_oxford_comma(
                ec_anatomy_terms, separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_anatomy_studies,
            additional_postfix_final_word="studies", use_single_form=True)
    ec_molreg_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, expression_cluster_type=ExpressionClusterType.MOLREG,
        feature=ExpressionClusterFeature.TERMS)
    ec_molreg_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.MOLREG)
    ec_genereg_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, expression_cluster_type=ExpressionClusterType.GENEREG,
        feature=ExpressionClusterFeature.TERMS)
    ec_genereg_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.GENEREG)
    if ec_genereg_terms:
        several_word = ""
        if len(ec_genereg_terms) > 3:
            # keep only the 3 most popular terms by textpresso popularity
            # NOTE(review): the tie-break key x[0][1] indexes the SECOND CHARACTER of the term string —
            # this looks copied from the ortholog variant where x[0] is an (id, symbol) pair; confirm
            t_p = [t_p for t_p in sorted([[term, api_manager.get_textpresso_popularity(term)]
                                          for term in ec_genereg_terms],
                                         key=lambda x: (x[1], x[0][1]), reverse=True)]
            ec_genereg_terms = [term for term, popularity in t_p[0:3]]
            several_word = "several genes including "
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_GENE,
            description="is affected by " + several_word + concatenate_words_with_oxford_comma(
                ec_genereg_terms, separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_genereg_studies,
            additional_postfix_final_word="studies", use_single_form=True)
    if ec_molreg_terms:
        several_word = ""
        if len(ec_molreg_terms) > 3:
            # spell out the total count, then list only the first 3 chemicals below
            several_word = num2words(len(ec_molreg_terms)) + " chemicals including "
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_MOLECULE,
            description="is affected by " + several_word + concatenate_words_with_oxford_comma(
                ec_molreg_terms[0:3], separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_molreg_studies,
            additional_postfix_final_word="studies", use_single_form=True)
def generate_ortholog_sentence_wormbase_non_c_elegans(orthologs: List[List[str]], orthologs_sp_fullname: str,
                                                      api_manager: APIManager, config: GenedescConfigParser):
    """build orthology sentence for WormBase non-elegans orthologs

    Args:
        orthologs (List[List[str]]): list of orthologs, each containing gene_id, gene_symbol
        orthologs_sp_fullname (str): full name of species from which to extract orthologs
        api_manager (APIManager): api manager to send requests to wormbase and textpresso
        config (GenedescConfigParser): a gene description configuration object

    Returns:
        str: the orthology sentence, or None when there are no orthologs
    """
    orth_sentence = None
    if len(orthologs) > 0:
        # abbreviate the genus to its initial, e.g. "Caenorhabditis briggsae" -> "C. briggsae"
        fullname_arr = orthologs_sp_fullname.split(" ")
        if len(fullname_arr[0]) > 2:
            fullname_arr[0] = fullname_arr[0][0] + "."
            orthologs_sp_fullname = " ".join(fullname_arr)
        if len(orthologs) > 3:
            # sort orthologs by tpc popularity and alphabetically (if tied)
            # NOTE(review): reverse=True also reverses the alphabetical tie-break (descending by
            # symbol), and x[0][1] is the gene symbol of the [id, symbol] pair — confirm intended order
            orthologs_pop = [o_p for o_p in
                             sorted([[ortholog, api_manager.get_textpresso_popularity(ortholog[1])]
                                     for ortholog in orthologs],
                                    key=lambda x: (x[1], x[0][1]), reverse=True)]
            # partition orthologs into those belonging to a gene class and those without one
            classes_orth_pop = defaultdict(list)
            orthologs_pop_wo_class = []
            for o_p in orthologs_pop:
                gene_class = api_manager.get_gene_class(o_p[0][0])
                if gene_class:
                    classes_orth_pop[gene_class].append(o_p)
                else:
                    orthologs_pop_wo_class.append(o_p)
            # a single gene class is not worth grouping: fold its members back into the class-less list;
            # otherwise only keep classes that still have more than one member, represented by their
            # most popular gene
            if len(list(classes_orth_pop.keys())) == 1:
                orthologs_pop_wo_class.extend(classes_orth_pop[list(classes_orth_pop.keys())[0]])
                classes_orth_pop = {}
            else:
                for gene_class, orths_with_pop in classes_orth_pop.items():
                    if len(orths_with_pop) == 1:
                        orthologs_pop_wo_class.extend(orths_with_pop)
                classes_orth_pop = {gene_class: ops[0] for gene_class, ops in classes_orth_pop.items() if
                                    len(ops) > 1}
            # merge class-less genes (tag 0) and class representatives (tag 1), keep the 3 most popular
            sorted_items = [[o_p, 0] for o_p in orthologs_pop_wo_class]
            sorted_items.extend([[o_p, 1, gene_class] for gene_class, o_p in classes_orth_pop.items()])
            sorted_items.sort(key=lambda x: x[0][1], reverse=True)
            if len(sorted_items) > 3:
                sorted_items = sorted_items[0:3]
            gene_symbols_wo_class = [item[0][0][1] for item in sorted_items if item[1] == 0]
            classes_symbols = [item[2] for item in sorted_items if item[1] == 1]
            genes_symbols_in_classes = [item[0][0][1] for item in sorted_items if item[1] == 1]
            sentences_arr = []
            if len(gene_symbols_wo_class) > 0:
                sentences_arr.append(orthologs_sp_fullname + " " + concatenate_words_with_oxford_comma(
                    gene_symbols_wo_class, separator=config.get_terms_delimiter()))
            if len(classes_symbols) > 0:
                genes_symbols_in_classes_sent = concatenate_words_with_oxford_comma(
                    genes_symbols_in_classes, separator=config.get_terms_delimiter())
                classes_symbols_sent = concatenate_words_with_oxford_comma(
                    classes_symbols, separator=config.get_terms_delimiter())
                classes_word = "classes" if len(classes_symbols) > 1 else "class"
                sentences_arr.append("members of the " + orthologs_sp_fullname + " " + classes_symbols_sent +
                                     " gene " + classes_word + " including " + genes_symbols_in_classes_sent)
            orth_sentence = "is an ortholog of " + " and ".join(sentences_arr)
        else:
            # sort orthologs alphabetically
            orthologs_symbols = sorted([orth[1] for orth in orthologs])
            orth_sentence = "is an ortholog of " + orthologs_sp_fullname + " " + \
                concatenate_words_with_oxford_comma(orthologs_symbols,
                                                    separator=config.get_terms_delimiter())
    return orth_sentence
def load_ontology_from_file(self, ontology_type: DataType, ontology_url: str, ontology_cache_path: str,
                            config: GenedescConfigParser) -> None:
    """load an ontology (GO, DO, or expression) from file and prepare it for description generation

    Args:
        ontology_type (DataType): the type of ontology to set
        ontology_url (str): url to the ontology file
        ontology_cache_path (str): path to cache file for the ontology
        config (GenedescConfigParser): configuration object where to read properties
    """
    loaded_ontology = None
    active_module = None
    slim_path = ""
    # slim files are cached next to the ontology file
    cache_dir = os.path.dirname(os.path.normpath(ontology_cache_path))
    # download (or reuse cached) ontology file and build the module-specific subontology
    if ontology_type == DataType.GO:
        logger.info("Loading GO ontology data from file")
        ontology_file = self._get_cached_file(file_source_url=ontology_url, cache_path=ontology_cache_path)
        self.go_ontology = OntologyFactory().create(ontology_file).subontology(relations=self.go_relations)
        loaded_ontology = self.go_ontology
        active_module = Module.GO
        slim_path = os.path.join(cache_dir, "go_slim.obo")
    elif ontology_type == DataType.DO:
        logger.info("Loading DO ontology data from file")
        ontology_file = self._get_cached_file(file_source_url=ontology_url, cache_path=ontology_cache_path)
        self.do_ontology = OntologyFactory().create(ontology_file).subontology(relations=self.do_relations)
        loaded_ontology = self.do_ontology
        active_module = Module.DO_EXPERIMENTAL
        slim_path = os.path.join(cache_dir, "do_slim.obo")
    elif ontology_type == DataType.EXPR:
        logger.info("Loading Expression ontology data from file")
        ontology_file = self._get_cached_file(file_source_url=ontology_url, cache_path=ontology_cache_path)
        self.expression_ontology = OntologyFactory().create(ontology_file).subontology()
        loaded_ontology = self.expression_ontology
        active_module = Module.EXPRESSION
        slim_path = os.path.join(cache_dir, "exp_slim.obo")
    # apply configured term renaming rules, if any
    terms_replacement_regex = config.get_module_property(module=active_module,
                                                         prop=ConfigModuleProperty.RENAME_TERMS)
    if terms_replacement_regex:
        self.rename_ontology_terms(ontology=loaded_ontology, terms_replacement_regex=terms_replacement_regex)
    if ontology_type == DataType.EXPR:
        DataManager.add_article_to_expression_nodes(self.expression_ontology)
    # pre-compute node depths, used later when trimming terms
    for root_id in loaded_ontology.get_roots():
        set_all_depths_in_subgraph(ontology=loaded_ontology, root_id=root_id, relations=None)
    slim_url = config.get_module_property(module=active_module, prop=ConfigModuleProperty.SLIM_URL)
    self.load_slim(module=active_module, slim_url=slim_url, slim_cache_path=slim_path)
def _load_and_process_data(self):
    """ETL entry point: generate gene descriptions for every configured MOD and load them into Neo4j"""
    # create gene descriptions data manager and load common data
    context_info = ContextInfo()
    data_manager = DataFileManager(context_info.config_file_location)
    #go_onto_config = data_manager.get_config('GO')
    go_annot_config = data_manager.get_config('GAF')
    #do_onto_config = data_manager.get_config('DOID')
    go_annot_sub_dict = {sub.get_data_provider(): sub for sub in go_annot_config.get_sub_type_objects()}
    this_dir = os.path.split(__file__)[0]
    gd_config = GenedescConfigParser(os.path.join(this_dir, os.pardir, os.pardir, "gene_descriptions.yml"))
    gd_data_manager = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
    # GO and DO ontologies are shared across all providers
    gd_data_manager.set_ontology(ontology_type=DataType.GO, ontology=self.get_ontology(data_type=DataType.GO),
                                 config=gd_config)
    gd_data_manager.set_ontology(ontology_type=DataType.DO, ontology=self.get_ontology(data_type=DataType.DO),
                                 config=gd_config)
    # generate descriptions for each MOD
    for prvdr in [sub_type.get_data_provider().upper()
                  for sub_type in self.data_type_config.get_sub_type_objects()]:
        # each provider gets its own copy of the config so per-MOD tweaks don't leak
        gd_config_mod_specific = copy.deepcopy(gd_config)
        if prvdr == "WB":
            gd_config_mod_specific.config["expression_sentences_options"][
                "remove_children_if_parent_is_present"] = True
        self.logger.info("Generating gene descriptions for %s", prvdr)
        # human annotations come from RGD's data provider in the database
        data_provider = prvdr if prvdr != "HUMAN" else "RGD"
        json_desc_writer = DescriptionsWriter()
        go_annot_path = "file://" + os.path.join(os.getcwd(), "tmp",
                                                 go_annot_sub_dict[prvdr].file_to_download)
        gd_data_manager.load_associations_from_file(
            associations_type=DataType.GO, associations_url=go_annot_path,
            associations_cache_path=os.path.join(os.getcwd(), "tmp", "gd_cache",
                                                 "go_annot_" + prvdr + ".gaf"),
            config=gd_config_mod_specific)
        gd_data_manager.set_associations(
            associations_type=DataType.DO,
            associations=self.get_disease_annotations_from_db(data_provider=data_provider,
                                                              gd_data_manager=gd_data_manager,
                                                              logger=self.logger),
            config=gd_config_mod_specific)
        # expression data is only available for a subset of providers
        if prvdr in EXPRESSION_PRVD_SUBTYPE_MAP:
            gd_data_manager.set_ontology(ontology_type=DataType.EXPR,
                                         ontology=self.get_ontology(data_type=DataType.EXPR, provider=prvdr),
                                         config=gd_config_mod_specific)
            gd_data_manager.set_associations(
                associations_type=DataType.EXPR,
                associations=self.get_expression_annotations_from_db(data_provider=data_provider,
                                                                     gd_data_manager=gd_data_manager,
                                                                     logger=self.logger),
                config=gd_config_mod_specific)
        commit_size = self.data_type_config.get_neo4j_commit_size()
        generators = self.get_generators(prvdr, gd_data_manager, gd_config_mod_specific, json_desc_writer)
        query_template_list = [
            [self.gene_descriptions_query_template, commit_size, "genedescriptions_data_" + prvdr + ".csv"]
        ]
        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.save_descriptions_report_files(data_provider=prvdr, json_desc_writer=json_desc_writer,
                                            context_info=context_info, gd_data_manager=gd_data_manager)
def get_trimmed_terms_by_common_ancestor(
        self, terms: Set[str], terms_already_covered, aspect: str,
        config: GenedescConfigParser, high_priority_terms: List[str] = None):
    """Trim a set of ontology terms, preferring high priority terms.

    High priority terms are trimmed (via common ancestors) only if they alone
    exceed the per-sentence maximum; remaining slots are filled with low
    priority terms, which are also trimmed when they don't fit and pruned of
    any term that duplicates, or is a child/parent of, a high priority term.

    Args:
        terms (Set[str]): the set of terms to trim
        terms_already_covered: set of term ids already used, updated in place
        aspect (str): ontology aspect, used to pick the distance-from-root cutoff
        config (GenedescConfigParser): configuration object where to read properties
        high_priority_terms (List[str]): terms to keep preferentially, if any

    Returns:
        tuple: (trimmed term list, whether an "other" clause is needed,
        ancestors that cover multiple children)
    """
    dist_root = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.DISTANCE_FROM_ROOT)
    add_mul_common_anc = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.ADD_MULTIPLE_TO_COMMON_ANCEST)
    max_terms = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.MAX_NUM_TERMS_IN_SENTENCE)
    trimming_algorithm = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM)
    slim_set = self.data_manager.get_slim(module=self.module)
    slim_bonus_perc = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.SLIM_BONUS_PERC)
    add_others_highp = False
    add_others_lowp = False
    ancestors_covering_multiple_children = set()
    if not dist_root:
        # fall back to per-aspect defaults when no cutoff is configured
        dist_root = {'F': 1, 'P': 1, 'C': 2, 'D': 3, 'A': 3}
    # build the lookup set once: O(1) membership instead of an O(n) list scan
    # per term (also avoids the dead `is None` check the comprehension made
    # impossible in the previous version)
    high_priority_set = set(high_priority_terms) if high_priority_terms else set()
    terms_high_priority = [term for term in terms if term in high_priority_set]
    if len(terms_high_priority) > max_terms:
        terms_high_priority = self.remove_children_if_parents_present(
            terms_high_priority, self.ontology, terms_already_covered)
    if len(terms_high_priority) > max_terms:
        logger.debug("Reached maximum number of terms. Applying trimming to high priority terms")
        terms_high_priority, add_others_highp = get_best_nodes(
            terms_high_priority, trimming_algorithm, max_terms, self.ontology,
            terms_already_covered,
            ancestors_covering_multiple_children if add_mul_common_anc else None,
            slim_bonus_perc, dist_root[aspect], slim_set,
            nodeids_blacklist=config.get_module_property(
                module=self.module, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    else:
        terms_already_covered.update(terms_high_priority)
    terms_low_priority = [term for term in terms if term not in high_priority_set]
    trimming_threshold = max_terms - len(terms_high_priority)
    if 0 < trimming_threshold < len(terms_low_priority):
        terms_low_priority, add_others_lowp = get_best_nodes(
            terms_low_priority, trimming_algorithm, trimming_threshold, self.ontology,
            terms_already_covered,
            ancestors_covering_multiple_children if add_mul_common_anc else None,
            slim_bonus_perc, dist_root[aspect], slim_set,
            nodeids_blacklist=config.get_module_property(
                module=self.module, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif trimming_threshold <= 0 < len(terms_low_priority):
        # no room left for low priority terms at all -- they become "other"
        add_others_lowp = True
    # copy, so extending the result below cannot alias-mutate terms_high_priority
    terms = terms_high_priority[:]
    terms_low_priority_orig = terms_low_priority[:]
    # remove exact overlap
    terms_low_priority = list(set(terms_low_priority) - set(terms_high_priority))
    # remove possible children of terms in the high priority list: the union
    # lets high priority parents knock out low priority children, and the
    # final subtraction restores the low-priority-only view
    terms_low_priority = list(set(terms_low_priority) | set(terms_high_priority))
    terms_low_priority = OntologySentenceGenerator.remove_children_if_parents_present(
        terms_low_priority, self.ontology)
    # remove possible parents of terms in the high priority list
    terms_low_priority = list(set(terms_low_priority) | set(terms_high_priority))
    terms_low_priority = OntologySentenceGenerator.remove_parents_if_child_present(
        terms_low_priority, self.ontology)
    terms_low_priority = list(set(terms_low_priority) - set(terms_high_priority))
    if len(terms_low_priority) < len(terms_low_priority_orig):
        # something was pruned, so the sentence needs an "other" clause
        add_others_lowp = True
    terms.extend(terms_low_priority)
    # cutoff terms - if number of terms with high priority is higher than max_num_terms
    terms = terms[0:max_terms]
    return terms, add_others_highp or add_others_lowp, ancestors_covering_multiple_children