def setUp(self):
    """Configure logging and load the GO test ontology and WB associations.

    Creates ``self.conf_parser`` from ``config_test.yml`` and ``self.df``
    (a ``DataManager``), then loads ``go_gd_test.obo`` and
    ``gene_association_1.7.wb.partial`` from the ``data`` folder next to
    this module, using the sibling ``cache`` folder as cache location.
    """
    logging.basicConfig(filename=None, level="ERROR",
                        format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
    logger.info("Starting DataManager tests")

    self.this_dir = os.path.dirname(__file__)
    config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
    self.conf_parser = GenedescConfigParser(config_path)
    self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])

    ontology_file = "go_gd_test.obo"
    logger.info("Loading go ontology from file")
    self.df.load_ontology_from_file(
        ontology_type=DataType.GO,
        ontology_url="file://" + os.path.join(self.this_dir, "data", ontology_file),
        ontology_cache_path=os.path.join(self.this_dir, "cache", ontology_file),
        config=self.conf_parser)

    associations_file = "gene_association_1.7.wb.partial"
    logger.info("Loading go associations from file")
    self.df.load_associations_from_file(
        associations_type=DataType.GO,
        associations_url="file://" + os.path.join(self.this_dir, "data", associations_file),
        associations_cache_path=os.path.join(self.this_dir, "cache", associations_file),
        config=self.conf_parser)
def test_get_common_ancestors(self):
    """Check common-ancestor lookup and that an excluded term is not returned.

    First verifies that the experimental 'P' terms of WB:WBGene00000912 have
    at least one common ancestor. Then adds four experimental 'P'
    annotations for WB:WBGene00003931, appends GO:0040024 to the configured
    ``exclude_terms`` and verifies that term is absent from the common
    ancestors computed for the new gene.
    """
    self.load_go_ontology()
    generator = OntologySentenceGenerator(gene_id="WB:WBGene00000912", module=Module.GO,
                                          data_manager=self.df, config=self.conf_parser)
    node_ids = generator.terms_groups[('P', '')]["EXPERIMENTAL"]
    common_ancestors = get_all_common_ancestors(node_ids, generator.ontology)
    # assertGreater reports the actual length on failure, unlike assertTrue(len(...) > 0).
    self.assertGreater(len(common_ancestors), 0, "Common ancestors not found")

    # Start from the loaded associations and add extra records for the second gene.
    associations = [association
                    for subj_associations in self.df.go_associations.associations_by_subj.values()
                    for association in subj_associations]
    for term_id in ("GO:0043055", "GO:0061065", "GO:0043054", "GO:0043053"):
        associations.append(DataManager.create_annotation_record(
            source_line="", gene_id="WB:WBGene00003931", gene_symbol="", gene_type="gene",
            taxon_id="", object_id=term_id, qualifiers="", aspect="P", ecode="EXP",
            references="", prvdr="WB", date=""))
    self.df.go_associations = AssociationSetFactory().create_from_assocs(
        assocs=associations, ontology=self.df.go_ontology)

    self.conf_parser.config["go_sentences_options"]["exclude_terms"].append("GO:0040024")
    generator = OntologySentenceGenerator(gene_id="WB:WBGene00003931", module=Module.GO,
                                          data_manager=self.df, config=self.conf_parser)
    node_ids = generator.terms_groups[('P', '')]["EXPERIMENTAL"]
    common_ancestors = get_all_common_ancestors(node_ids, generator.ontology)
    self.assertNotIn("GO:0040024", common_ancestors, "Common ancestors contain blacklisted term")
class TestGOModule(unittest.TestCase):
    """Tests for GO ontology and association loading through ``DataManager``."""

    def setUp(self):
        """Load the GO test ontology and the partial WB association file."""
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(
            os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(
                self.this_dir, "data", "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(
                self.this_dir, "cache", "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        """The ontology is loaded and GO:0009987 is a parent of GO:0000075."""
        self.assertIsNotNone(self.df.go_ontology)
        self.assertIn("GO:0009987", self.df.go_ontology.parents("GO:0000075"))

    def test_annotations_exist(self):
        """Associations are loaded and WB:WBGene00000001 has annotations."""
        self.assertIsNotNone(self.df.go_associations)
        annotations = self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001", annot_type=DataType.GO, include_obsolete=False,
            include_negative_results=False,
            priority_list=self.conf_parser.get_annotations_priority(module=Module.GO))
        self.assertGreater(len(annotations), 0)

    def test_rename_terms(self):
        """No key of the configured rename map is found by an ontology search."""
        rename_terms = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS)
        for term in rename_terms:
            # Per-term assertion so that a failure names the offending term.
            self.assertEqual(len(self.df.go_ontology.search(term)), 0,
                             "Term still found in ontology: " + str(term))

    def test_exclude_terms(self):
        """Placeholder: this test currently performs no checks."""
        # NOTE(review): empty test body always passes - add assertions or remove.
        pass
def test_set_associations(self):
    """Setting a hand-built association set leaves ``go_associations`` truthy."""
    records = [
        DataManager.create_annotation_record("", subject_id, symbol, "protein_coding", "001",
                                             term_id, "", "F", "EXP", None, "WB", "")
        for subject_id, symbol, term_id in (("1", "a", "GO:0019901"),
                                            ("2", "b", "GO:0005515"))
    ]
    association_set = AssociationSetFactory().create_from_assocs(
        assocs=records, ontology=self.df.go_ontology)
    self.df.set_associations(associations_type=DataType.GO, associations=association_set,
                             config=self.conf_parser)
    self.assertTrue(self.df.go_associations)
def load_do_ontology(self):
    """Create the config parser and data manager, then load ``doid.obo``.

    Sets ``self.this_dir``, ``self.conf_parser`` and ``self.df``; the
    ontology is read from the ``data`` folder next to this module and cached
    in the sibling ``cache`` folder.
    """
    logger.info("Starting Ontology Tools tests")
    self.this_dir = os.path.dirname(__file__)
    config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
    self.conf_parser = GenedescConfigParser(config_path)
    self.df = DataManager(do_relations=None)

    logger.info("Loading do ontology from file")
    logging.basicConfig(filename=None, level="ERROR",
                        format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
    ontology_file = "doid.obo"
    self.df.load_ontology_from_file(
        ontology_type=DataType.DO,
        ontology_url="file://" + os.path.join(self.this_dir, "data", ontology_file),
        ontology_cache_path=os.path.join(self.this_dir, "cache", ontology_file),
        config=self.conf_parser)
def test_remap_associations(self):
    """An association to GO:0018996 is stored with object id GO:0042303."""
    record = DataManager.create_annotation_record("", "1", "a", "protein_coding", "001",
                                                  "GO:0018996", "", "F", "EXP", None, "WB", "")
    association_set = AssociationSetFactory().create_from_assocs(
        assocs=[record], ontology=self.df.go_ontology)
    self.df.set_associations(associations_type=DataType.GO, associations=association_set,
                             config=self.conf_parser)
    stored = self.df.go_associations.associations_by_subj["1"][0]
    self.assertEqual(stored["object"]["id"], "GO:0042303")
def _load_expression_cluster_file(self, file_cache_path, file_url, load_into_data,
                                  add_to_expression_ontology_annotations: bool = False):
    """Parse a tab-separated expression cluster file into ``load_into_data``.

    The first line of the file is treated as a header and skipped. For every
    other line, the first column becomes the key in ``load_into_data`` and
    the remaining columns the value (a list). Within that list:

    * index 2 is split on commas and passed through
      ``WBDataManager.get_replaced_terms_arr`` with the expression module's
      ``RENAME_TERMS`` property;
    * index 3, when non-empty, is split on commas and has the substrings
      " study" and " analysis" removed from each item.

    Args:
        file_cache_path: cache location passed to ``_get_cached_file``.
        file_url: source URL passed to ``_get_cached_file``.
        load_into_data: mapping that is filled in place, one entry per line.
        add_to_expression_ontology_annotations: when True, every term at
            index 2 whose name resolves in ``self.expression_ontology`` is
            added as an annotation record to the existing expression
            associations, which are then stored again via
            ``set_associations``.

    Raises:
        IndexError: if a data line has fewer columns than the indices
            accessed above.
    """
    expr_clust_file = self._get_cached_file(cache_path=file_cache_path, file_source_url=file_url)
    associations = []
    terms_ids_map = {}
    if add_to_expression_ontology_annotations:
        associations = [
            association
            for subj_associations in self.expression_associations.associations_by_subj.values()
            for association in subj_associations
        ]
    terms_replacement_regex = self.config.get_module_property(
        module=Module.EXPRESSION, prop=ConfigModuleProperty.RENAME_TERMS)

    # Context manager guarantees the handle is closed, also when a line fails to parse.
    with open(expr_clust_file) as expr_clust_handle:
        next(expr_clust_handle, None)  # skip the header line (no-op on an empty file)
        for line in expr_clust_handle:
            linearr = line.strip().split("\t")
            entry_id = linearr[0]
            load_into_data[entry_id] = linearr[1:]
            record = load_into_data[entry_id]
            record[2] = WBDataManager.get_replaced_terms_arr(
                record[2].split(","), terms_replacement_regex)
            if record[3]:
                record[3] = [word.replace(" study", "").replace(" analysis", "")
                             for word in record[3].split(",")]
            if not add_to_expression_ontology_annotations:
                continue
            for term in record[2]:
                # Resolve each term name once; unresolved names are remembered as None.
                if term not in terms_ids_map:
                    term_ids = self.expression_ontology.resolve_names([term])
                    terms_ids_map[term] = term_ids[0] if term_ids else None
                if terms_ids_map[term]:
                    associations.append(DataManager.create_annotation_record(
                        line, "WB:" + entry_id, "", "gene", "", terms_ids_map[term],
                        ["Enriched"], "A", "IDA", "", "", ""))

    if add_to_expression_ontology_annotations:
        self.set_associations(
            DataType.EXPR,
            associations=AssociationSetFactory().create_from_assocs(
                assocs=associations, ontology=self.expression_ontology),
            config=self.config)
def load_data(organism, conf_parser: GenedescConfigParser):
    """Create and populate the data managers used for one organism.

    Args:
        organism: organism key; looked up in the mapping returned by
            ``conf_parser.get_wb_organisms_info()``.
        conf_parser: configuration parser providing URLs and cache paths.

    Returns:
        A tuple ``(df, sister_df, df_agr)`` where ``df`` is the fully loaded
        ``WBDataManager`` for ``organism``; ``sister_df`` holds GO data for
        the configured ``main_sister_species`` (``None`` when not set); and
        ``df_agr`` holds the human-ortholog GO data, which is only loaded
        when ``organism`` is ``"c_elegans"`` (``None`` otherwise).
    """
    logger = logging.getLogger("WB Gene Description Pipeline - Data loader")
    go_relations = ["subClassOf", "BFO:0000050"]
    sister_df = None
    df_agr = None
    organisms_info = conf_parser.get_wb_organisms_info()
    df = WBDataManager(species=organism, do_relations=None, go_relations=list(go_relations),
                       config=conf_parser)

    if organism == "c_elegans":
        df_agr = DataManager(go_relations=list(go_relations), do_relations=None)
        df_agr.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url=conf_parser.get_wb_human_orthologs_go_ontology(),
            ontology_cache_path=os.path.join(conf_parser.get_cache_dir(), "wormbase_agr_human",
                                             "go_ontology.obo"),
            config=conf_parser)
        df_agr.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=conf_parser.get_wb_human_orthologs_go_associations(),
            associations_cache_path=os.path.join(conf_parser.get_cache_dir(),
                                                 "wormbase_agr_human", "go_assoc.daf.gz"),
            config=conf_parser)

    organism_info = organisms_info[organism]
    sister_species = organism_info.get("main_sister_species")
    if sister_species:
        sister_df = WBDataManager(species=organism_info["main_sister_species"],
                                  do_relations=None, go_relations=list(go_relations),
                                  config=conf_parser)
        logger.info("Loading GO data for sister species")
        sister_df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url=sister_df.go_ontology_url,
            ontology_cache_path=sister_df.go_ontology_cache_path,
            config=conf_parser)
        sister_df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=sister_df.go_associations_url,
            associations_cache_path=sister_df.go_associations_cache_path,
            config=conf_parser)

    logger.info("Loading all data for main species")
    df.load_all_data_from_file()
    return df, sister_df, df_agr
class TestDescriptionsGenerator(unittest.TestCase):
    """Tests for building a ``GeneDescription`` from GO module sentences."""

    def setUp(self):
        """Load the GO test ontology and the partial FB association file."""
        log_format = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
        logger.info("Starting Ontology Tools tests")
        self.this_dir = os.path.dirname(__file__)
        config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
        self.conf_parser = GenedescConfigParser(config_path)
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])

        logger.info("Loading go ontology from file")
        logging.basicConfig(filename=None, level="ERROR", format=log_format)
        ontology_file = "go_gd_test.obo"
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", ontology_file),
            ontology_cache_path=os.path.join(self.this_dir, "cache", ontology_file),
            config=self.conf_parser)

        logger.info("Loading go associations from file")
        associations_file = "gene_association_1.7.fb.partial"
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(self.this_dir, "data", associations_file),
            associations_cache_path=os.path.join(self.this_dir, "cache", associations_file),
            config=self.conf_parser)
        # NOTE(review): per the logging docs, basicConfig does nothing once the root
        # logger has handlers, so this second call presumably has no effect - confirm.
        logging.basicConfig(filename=None, level="INFO", format=log_format)

    def test_set_or_extend_module_description_and_final_stats(self):
        """A description is produced both without and with the gene name."""
        gene_id = "FB:FBgn0027655"
        sentence_generator = OntologySentenceGenerator(gene_id=gene_id, module=Module.GO,
                                                       data_manager=self.df,
                                                       config=self.conf_parser)
        sentences = None
        # NOTE(review): assertTrue only checks that the description is truthy; the
        # text below is used as the failure message, not compared with the result.
        # It looks like an equality check was intended - confirm the expected
        # output before switching to assertEqual.
        cases = (
            (False,
             "Is involved in several processes, including axo-dendritic transport, "
             "establishment of mitotic spindle orientation, and positive regulation "
             "of extent of heterochromatin assembly"),
            (True,
             "Test gene is involved in several processes, including axo-dendritic "
             "transport, establishment of mitotic spindle orientation, and positive "
             "regulation of extent of heterochromatin assembly"),
        )
        for add_gene_name, failure_message in cases:
            gene_desc = GeneDescription(gene_id=gene_id, gene_name="Test gene",
                                        add_gene_name=add_gene_name, config=self.conf_parser)
            if sentences is None:
                # Sentences are computed once, after the first description object
                # is created, and reused for the second case.
                sentences = sentence_generator.get_module_sentences(
                    aspect='P', qualifier='', merge_groups_with_same_prefix=True,
                    keep_only_best_group=True)
            gene_desc.set_or_extend_module_description_and_final_stats(
                module=Module.GO_PROCESS, module_sentences=sentences)
            self.assertTrue(gene_desc.description, msg=failure_message)
def __init__(self, gene_id: str, module: Module, data_manager: DataManager, config: GenedescConfigParser,
             limit_to_group: str = None, humans: bool = False):
    """Initialize the sentence generator for one gene and one module.

    Args:
        gene_id (str): id of the gene whose annotations are fetched from
            ``data_manager``
        module (Module): module used to select the data type and the
            module-specific configuration values
        data_manager (DataManager): source of the ontology, annotations,
            associations and slim set
        config (GenedescConfigParser): config object from which to read the
            options
        limit_to_group (str): limit the evidence codes to the specified
            group; forwarded to ``set_terms_groups``
        humans (bool): forwarded to the prefix/postfix sentence map lookup
            and to ``set_terms_groups``
    """
    data_type = get_data_type_from_module(module)
    self.ontology = data_manager.get_ontology(data_type)
    self.config = config
    self.module = module
    self.terms_already_covered = set()
    self.terms_groups = defaultdict(lambda: defaultdict(set))
    self.evidence_groups_priority_list = config.get_evidence_groups_priority_list(module=module)
    self.prepostfix_sentences_map = config.get_prepostfix_sentence_map(module=module, humans=humans)
    self.gene_annots = data_manager.get_annotations_for_gene(
        gene_id=gene_id, annot_type=data_type,
        priority_list=config.get_annotations_priority(module=module))

    # The trimming class is chosen from the configured trimming algorithm name.
    trimming_algorithm = config.get_module_property(
        module=module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM)
    trimmer_class = CONF_TO_TRIMMING_CLASS[trimming_algorithm]
    self.trimmer = trimmer_class(
        ontology=self.ontology,
        annotations=data_manager.get_associations(data_type),
        nodeids_blacklist=config.get_module_property(
            module=module, prop=ConfigModuleProperty.EXCLUDE_TERMS),
        slim_terms_ic_bonus_perc=config.get_module_property(
            module=module, prop=ConfigModuleProperty.SLIM_BONUS_PERC),
        slim_set=data_manager.get_slim(module=module))
    self.set_terms_groups(module, config, limit_to_group, humans)
def _load_and_process_data(self):
    """Generate gene descriptions for every data provider and store them.

    Builds one shared ``DataManager`` holding the GO and DO ontologies, then
    for each provider of ``self.data_type_config``: loads the provider's GO
    annotation file, sets disease (and, for providers listed in
    ``EXPRESSION_PRVD_SUBTYPE_MAP``, expression) data obtained from the
    database, writes the generated rows to CSV, executes the Neo4j query
    batch and saves the description report files.
    """
    # Common configuration and data shared by all providers.
    context_info = ContextInfo()
    data_manager = DataFileManager(context_info.config_file_location)
    go_annot_config = data_manager.get_config('GAF')
    go_annot_sub_dict = {}
    for sub in go_annot_config.get_sub_type_objects():
        go_annot_sub_dict[sub.get_data_provider()] = sub
    this_dir = os.path.split(__file__)[0]
    gd_config = GenedescConfigParser(
        os.path.join(this_dir, os.pardir, os.pardir, "gene_descriptions.yml"))
    gd_data_manager = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
    for ontology_type in (DataType.GO, DataType.DO):
        gd_data_manager.set_ontology(ontology_type=ontology_type,
                                     ontology=self.get_ontology(data_type=ontology_type),
                                     config=gd_config)

    providers = [sub_type.get_data_provider().upper()
                 for sub_type in self.data_type_config.get_sub_type_objects()]
    # Generate descriptions for each MOD.
    for prvdr in providers:
        # Each provider works on its own copy so that overrides do not leak.
        gd_config_mod_specific = copy.deepcopy(gd_config)
        if prvdr == "WB":
            expression_options = gd_config_mod_specific.config["expression_sentences_options"]
            expression_options["remove_children_if_parent_is_present"] = True
        self.logger.info("Generating gene descriptions for %s", prvdr)
        data_provider = "RGD" if prvdr == "HUMAN" else prvdr
        json_desc_writer = DescriptionsWriter()

        go_annot_path = "file://" + os.path.join(os.getcwd(), "tmp",
                                                 go_annot_sub_dict[prvdr].file_to_download)
        gd_data_manager.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url=go_annot_path,
            associations_cache_path=os.path.join(os.getcwd(), "tmp", "gd_cache",
                                                 "go_annot_" + prvdr + ".gaf"),
            config=gd_config_mod_specific)

        disease_annotations = self.get_disease_annotations_from_db(
            data_provider=data_provider, gd_data_manager=gd_data_manager, logger=self.logger)
        gd_data_manager.set_associations(associations_type=DataType.DO,
                                         associations=disease_annotations,
                                         config=gd_config_mod_specific)

        if prvdr in EXPRESSION_PRVD_SUBTYPE_MAP:
            expression_ontology = self.get_ontology(data_type=DataType.EXPR, provider=prvdr)
            gd_data_manager.set_ontology(ontology_type=DataType.EXPR,
                                         ontology=expression_ontology,
                                         config=gd_config_mod_specific)
            expression_annotations = self.get_expression_annotations_from_db(
                data_provider=data_provider, gd_data_manager=gd_data_manager,
                logger=self.logger)
            gd_data_manager.set_associations(associations_type=DataType.EXPR,
                                             associations=expression_annotations,
                                             config=gd_config_mod_specific)

        commit_size = self.data_type_config.get_neo4j_commit_size()
        generators = self.get_generators(prvdr, gd_data_manager, gd_config_mod_specific,
                                         json_desc_writer)
        csv_file_name = "genedescriptions_data_" + prvdr + ".csv"
        query_template_list = [
            [self.gene_descriptions_query_template, commit_size, csv_file_name],
        ]
        query_and_file_list = self.process_query_params(query_template_list)
        CSVTransactor.save_file_static(generators, query_and_file_list)
        Neo4jTransactor.execute_query_batch(query_and_file_list)
        self.save_descriptions_report_files(data_provider=prvdr,
                                            json_desc_writer=json_desc_writer,
                                            context_info=context_info,
                                            gd_data_manager=gd_data_manager)
class TestOntologyTools(unittest.TestCase):
    """Tests for common-ancestor lookup, information content and set covering."""

    @staticmethod
    def _annotation_record(gene_id, object_id, aspect, ecode):
        """Build a WB 'gene' annotation record with all remaining fields empty."""
        return DataManager.create_annotation_record(
            source_line="", gene_id=gene_id, gene_symbol="", gene_type="gene", taxon_id="",
            object_id=object_id, qualifiers="", aspect=aspect, ecode=ecode, references="",
            prvdr="WB", date="")

    def load_go_ontology(self):
        """Create config/data manager and load the GO test ontology and WB associations."""
        logger.info("Starting Ontology Tools tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(
            os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(
                self.this_dir, "data", "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(
                self.this_dir, "cache", "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def load_do_ontology(self):
        """Create config/data manager and load the DO ontology (``doid.obo``)."""
        logger.info("Starting Ontology Tools tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(
            os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml"))
        self.df = DataManager(do_relations=None)
        logger.info("Loading do ontology from file")
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        self.df.load_ontology_from_file(
            ontology_type=DataType.DO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "doid.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "doid.obo"),
            config=self.conf_parser)

    def test_get_common_ancestors(self):
        """Common ancestors exist, and an excluded term is not among them."""
        self.load_go_ontology()
        generator = OntologySentenceGenerator(gene_id="WB:WBGene00000912", module=Module.GO,
                                              data_manager=self.df, config=self.conf_parser)
        node_ids = generator.terms_groups[('P', '')]["EXPERIMENTAL"]
        common_ancestors = get_all_common_ancestors(node_ids, generator.ontology)
        self.assertGreater(len(common_ancestors), 0, "Common ancestors not found")

        # Extend the loaded associations with records for a second gene.
        associations = [association
                        for subj_associations
                        in self.df.go_associations.associations_by_subj.values()
                        for association in subj_associations]
        for term_id in ("GO:0043055", "GO:0061065", "GO:0043054", "GO:0043053"):
            associations.append(self._annotation_record(
                gene_id="WB:WBGene00003931", object_id=term_id, aspect="P", ecode="EXP"))
        self.df.go_associations = AssociationSetFactory().create_from_assocs(
            assocs=associations, ontology=self.df.go_ontology)
        self.conf_parser.config["go_sentences_options"]["exclude_terms"].append("GO:0040024")
        generator = OntologySentenceGenerator(gene_id="WB:WBGene00003931", module=Module.GO,
                                              data_manager=self.df, config=self.conf_parser)
        node_ids = generator.terms_groups[('P', '')]["EXPERIMENTAL"]
        common_ancestors = get_all_common_ancestors(node_ids, generator.ontology)
        self.assertNotIn("GO:0040024", common_ancestors,
                         "Common ancestors contain blacklisted term")

    def test_information_content(self):
        """Every root node has an information content of 0."""
        self.load_go_ontology()
        set_all_information_content_values(ontology=self.df.go_ontology)
        for root_id in self.df.go_ontology.get_roots():
            self.assertEqual(self.df.go_ontology.node(root_id)["IC"], 0,
                             "Root IC not equal to 0")

    def test_find_set_covering(self):
        """Set covering with weights, without weights, and with a bad weight vector."""
        subsets = [("1", "1", {"A", "B", "C"}), ("2", "2", {"A", "B"}), ("3", "3", {"C"}),
                   ("4", "4", {"A"}), ("5", "5", {"B"}), ("6", "6", {"C"})]
        values = [2, 12, 5, 20, 20, 20]
        # test with weights
        set_covering = [best_set[0] for best_set in
                        find_set_covering(subsets=subsets, value=values, max_num_subsets=3)]
        for expected_id in ("2", "6"):
            self.assertIn(expected_id, set_covering)
        for unexpected_id in ("1", "3", "4", "5"):
            self.assertNotIn(unexpected_id, set_covering)
        # test without weights
        set_covering_noweights = [best_set[0] for best_set in
                                  find_set_covering(subsets=subsets, value=None,
                                                    max_num_subsets=3)]
        self.assertEqual(set_covering_noweights, ["1"])
        # test wrong input
        costs_wrong = [1, 3]
        set_covering_wrong = find_set_covering(subsets=subsets, value=costs_wrong,
                                               max_num_subsets=3)
        self.assertIsNone(set_covering_wrong,
                          "Cost vector with length different than subsets should return None")

        subsets = [("1", "1", {"7"}), ("2", "2", {"7", "12", "13"}), ("3", "3", {"16", "17"}),
                   ("4", "4", {"11"}), ("6", "6", {"12", "13"}), ("7", "7", {"7"}),
                   ("9", "9", {"16", "17"}), ("11", "11", {"11"}), ("12", "12", {"12"}),
                   ("13", "13", {"13"}), ("16", "16", {"16"}), ("17", "17", {"17"})]
        values = [1, 1, 0.875061263, 1.301029996, 1.301029996, 1.602059991, 1.301029996,
                  1.698970004, 1.698970004, 1.698970004, 1.698970004, 1.698970004]
        set_covering = [best_set[0] for best_set in
                        find_set_covering(subsets=subsets, value=values, max_num_subsets=3)]
        for expected_id in ("2", "9", "11"):
            self.assertIn(expected_id, set_covering)

    def test_set_covering_with_ontology(self):
        """Generate DO-orthology sentences with the "ic" trimming algorithm."""
        self.load_do_ontology()
        self.conf_parser.config["do_via_orth_sentences_options"]["trimming_algorithm"] = "ic"
        self.conf_parser.config["do_via_orth_sentences_options"]["max_num_terms"] = 5
        high_priority_ids = ["DOID:0080028", "DOID:0080056", "DOID:14789", "DOID:0080026",
                             "DOID:14415", "DOID:0080045"]
        other_ids = ["DOID:3371", "DOID:8886", "DOID:674", "DOID:5614", "DOID:11830",
                     "DOID:8398", "DOID:2256", "DOID:5327", "DOID:1123"]
        associations = [self._annotation_record(gene_id="MGI:88452", object_id=term_id,
                                                aspect="D", ecode="ISS")
                        for term_id in high_priority_ids + other_ids]
        self.df.do_associations = AssociationSetFactory().create_from_assocs(
            assocs=associations, ontology=self.df.do_ontology)
        generator = OntologySentenceGenerator(gene_id="MGI:88452", module=Module.DO_ORTHOLOGY,
                                              data_manager=self.df, config=self.conf_parser)
        sentences = generator.get_module_sentences(
            config=self.conf_parser, aspect='D', qualifier='',
            merge_groups_with_same_prefix=True, keep_only_best_group=True,
            high_priority_term_ids=high_priority_ids)
        # NOTE(review): this test only prints the result and asserts nothing - it can
        # fail only if an exception is raised. Consider asserting on the description.
        print(sentences.get_description())
class TestGOModule(unittest.TestCase):
    """Tests for ``DataManager``: GO ontology/association loading and setters."""

    def setUp(self):
        """Load the GO test ontology and the partial WB association file."""
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(
            os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(
                self.this_dir, "data", "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(
                self.this_dir, "cache", "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        """The ontology is loaded and GO:0009987 is a parent of GO:0000075."""
        self.assertIsNotNone(self.df.go_ontology)
        self.assertIn("GO:0009987", self.df.go_ontology.parents("GO:0000075"))

    def test_annotations_exist(self):
        """Associations are loaded and WB:WBGene00000001 has annotations."""
        self.assertIsNotNone(self.df.go_associations)
        annotations = self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001", annot_type=DataType.GO, include_obsolete=False,
            include_negative_results=False,
            priority_list=self.conf_parser.get_annotations_priority(module=Module.GO))
        self.assertGreater(len(annotations), 0)

    def test_rename_terms(self):
        """No key of the configured rename map is found by an ontology search."""
        rename_terms = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS)
        for term in rename_terms:
            # Per-term assertion so that a failure names the offending term.
            self.assertEqual(len(self.df.go_ontology.search(term)), 0,
                             "Term still found in ontology: " + str(term))

    def test_exclude_terms(self):
        """GO:0008286 is not among the annotations of WB:WBGene00000001."""
        test_annot = self.df.get_annotations_for_gene("WB:WBGene00000001",
                                                      annot_type=DataType.GO)
        self.assertNotIn("GO:0008286", [annot["object"]["id"] for annot in test_annot])

    def test_download_gz_file(self):
        """Fetching a ``.gz`` file returns the cache path without the ``.gz`` suffix."""
        gz_name = "c_elegans.PRJNA13758.WS273.geneIDs.txt.gz"
        test_file = self.df._get_cached_file(
            cache_path=os.path.join(self.this_dir, "cache", gz_name),
            file_source_url="file://" + os.path.join(self.this_dir, "data", gz_name))
        self.assertEqual(test_file, os.path.join(self.this_dir, "cache",
                                                 "c_elegans.PRJNA13758.WS273.geneIDs.txt"))

    def test_gene_data_functions(self):
        """``get_gene_data`` honours the dead/pseudo gene inclusion flags."""
        self.df.set_gene_data(gene_data=[Gene("1", "gene1", True, False),
                                         Gene("2", "gene2", False, True),
                                         Gene("3", "gene3", False, False),
                                         Gene("4", "gene4", True, True)])
        # (include_dead_genes, include_pseudo_genes, expected number of genes)
        cases = ((False, False, 1), (True, False, 2), (False, True, 2), (True, True, 4))
        for include_dead, include_pseudo, expected_count in cases:
            genes = list(self.df.get_gene_data(include_dead_genes=include_dead,
                                               include_pseudo_genes=include_pseudo))
            self.assertEqual(len(genes), expected_count)

    def test_get_human_gene_props(self):
        """Human gene properties are returned non-empty."""
        human_gene_props = self.df.get_human_gene_props()
        self.assertGreater(len(human_gene_props), 0)

    def test_get_ensembl_hgnc_ids_map(self):
        """The Ensembl-to-HGNC id map is returned non-empty."""
        ensembl_hgnc_ids_map = self.df.get_ensembl_hgnc_ids_map()
        self.assertGreater(len(ensembl_hgnc_ids_map), 0)

    def test_set_ontology(self):
        """Setting a small hand-built ontology keeps its node list unchanged."""
        ontology = OntologyFactory().create()
        for i in range(4):
            ontology.add_node(i, 'node' + str(i))
        # Node 0 is the parent of nodes 1, 2 and 3.
        for child in (1, 2, 3):
            ontology.add_parent(child, 0)
        self.df.set_ontology(ontology_type=DataType.GO, ontology=ontology,
                             config=self.conf_parser)
        self.assertEqual(list(self.df.go_ontology.nodes()), list(ontology.nodes()))

    def test_set_associations(self):
        """Setting a hand-built association set leaves ``go_associations`` truthy."""
        associations = [
            DataManager.create_annotation_record("", "1", "a", "protein_coding", "001",
                                                 "GO:0019901", "", "F", "EXP", None, "WB", ""),
            DataManager.create_annotation_record("", "2", "b", "protein_coding", "001",
                                                 "GO:0005515", "", "F", "EXP", None, "WB", ""),
        ]
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations,
                                                            ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs,
                                 config=self.conf_parser)
        self.assertTrue(self.df.go_associations)

    def test_remap_associations(self):
        """An association to GO:0018996 is stored with object id GO:0042303."""
        associations = [
            DataManager.create_annotation_record("", "1", "a", "protein_coding", "001",
                                                 "GO:0018996", "", "F", "EXP", None, "WB", ""),
        ]
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations,
                                                            ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs,
                                 config=self.conf_parser)
        self.assertEqual(
            self.df.go_associations.associations_by_subj["1"][0]["object"]["id"],
            "GO:0042303")
def test_set_covering_with_ontology(self):
    """Generate DO-via-orthology sentences for one mouse gene using IC trimming.

    Fifteen disease annotations (evidence code ISS, aspect D) are attached to
    ``MGI:88452``; the first six term ids are also passed to the generator as
    high-priority terms. The resulting description is printed; nothing is
    asserted on it.
    """
    self.load_do_ontology()
    orth_options = self.conf_parser.config["do_via_orth_sentences_options"]
    orth_options["trimming_algorithm"] = "ic"
    orth_options["max_num_terms"] = 5

    mouse_gene_id = "MGI:88452"
    priority_term_ids = ["DOID:0080028", "DOID:0080056", "DOID:14789", "DOID:0080026", "DOID:14415",
                         "DOID:0080045"]
    remaining_term_ids = ["DOID:3371", "DOID:8886", "DOID:674", "DOID:5614", "DOID:11830", "DOID:8398",
                          "DOID:2256", "DOID:5327", "DOID:1123"]

    # One record per term, in the same order as the two lists above.
    annotation_records = [
        DataManager.create_annotation_record(source_line="", gene_id=mouse_gene_id, gene_symbol="",
                                             gene_type="gene", taxon_id="", object_id=term_id,
                                             qualifiers="", aspect="D", ecode="ISS", references="",
                                             prvdr="WB", date="")
        for term_id in priority_term_ids + remaining_term_ids
    ]
    self.df.do_associations = AssociationSetFactory().create_from_assocs(assocs=annotation_records,
                                                                         ontology=self.df.do_ontology)

    sentence_generator = OntologySentenceGenerator(gene_id=mouse_gene_id, module=Module.DO_ORTHOLOGY,
                                                   data_manager=self.df, config=self.conf_parser)
    module_sentences = sentence_generator.get_module_sentences(
        config=self.conf_parser, aspect='D', qualifier='', merge_groups_with_same_prefix=True,
        keep_only_best_group=True, high_priority_term_ids=priority_term_ids)
    print(module_sentences.get_description())
def load_associations_from_file(
        self,
        associations_type: DataType,
        associations_url: str,
        associations_cache_path: str,
        config: GenedescConfigParser,
        association_additional_url: str = None,
        association_additional_cache_path: str = None) -> None:
    """Load GO, expression or disease associations from a (cached) source file.

    * ``DataType.GO``: delegated unchanged to the parent implementation.
    * ``DataType.EXPR``: the tab-separated file is parsed here; lines starting
      with ``!`` are skipped and only rows whose term (column 5) exists in the
      expression ontology are kept. Blacklisted terms are then removed.
    * ``DataType.DO``: the main file is parsed as GAF. When both additional
      arguments are given, only the ``IEA`` annotations of the main file are
      kept and every non-``IEA`` row of the additional tab-separated file whose
      term exists in the DO ontology is added (one record per gene; allele rows
      are expanded to their comma-separated genes). Blacklisted terms are then
      removed.

    Args:
        associations_type: which kind of associations to load.
        associations_url: source URL of the main associations file.
        associations_cache_path: local cache path of the main associations file.
        config: configuration providing the per-module excluded terms.
        association_additional_url: source URL of the additional disease file
            (only used for ``DataType.DO``).
        association_additional_cache_path: cache path of the additional disease
            file (only used for ``DataType.DO``).
    """
    logger.info("Loading associations from file")
    if associations_type == DataType.GO:
        super().load_associations_from_file(
            associations_type=associations_type,
            associations_url=associations_url,
            associations_cache_path=associations_cache_path,
            config=config)
    elif associations_type == DataType.EXPR:
        associations = []
        file_path = self._get_cached_file(
            cache_path=associations_cache_path,
            file_source_url=associations_url)
        # Context manager so the handle is closed even if a malformed row raises.
        with open(file_path) as associations_file:
            for line in associations_file:
                if line.strip().startswith("!"):
                    continue
                linearr = line.strip().split("\t")
                if not self.expression_ontology.node(linearr[4]):
                    continue
                gene_id = linearr[0] + ":" + linearr[1]
                qualifiers = linearr[3].split("|")
                # NOTE(review): str.split never returns an empty list, so the
                # first clause cannot fire (an empty column yields [""]). Kept
                # as in the original to avoid changing which rows become
                # "Verified".
                if len(qualifiers) == 0 or "Partial" in qualifiers or "Certain" in qualifiers:
                    qualifiers = ["Verified"]
                associations.append(
                    DataManager.create_annotation_record(
                        line, gene_id, linearr[2], linearr[11],
                        linearr[12], linearr[4], qualifiers,
                        linearr[8], linearr[6], linearr[5].split("|"),
                        linearr[14], linearr[13]))
        self.expression_associations = AssociationSetFactory().create_from_assocs(
            assocs=associations, ontology=self.expression_ontology)
        self.expression_associations = self.remove_blacklisted_annotations(
            association_set=self.expression_associations,
            ontology=self.expression_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.EXPRESSION,
                prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.DO:
        self.do_associations = AssociationSetFactory().create_from_assocs(
            assocs=GafParser().parse(
                file=self._get_cached_file(
                    cache_path=associations_cache_path,
                    file_source_url=associations_url),
                skipheader=True),
            ontology=self.do_ontology)
        if association_additional_cache_path and association_additional_url:
            # Keep only the electronically inferred annotations from the main
            # file; everything else comes from the additional file below.
            associations = [
                association
                for subj_associations in self.do_associations.associations_by_subj.values()
                for association in subj_associations
                if association["evidence"]["type"] == "IEA"
            ]
            file_path = self._get_cached_file(
                cache_path=association_additional_cache_path,
                file_source_url=association_additional_url)
            header = True
            with open(file_path) as additional_file:
                for line in additional_file:
                    if line.strip().startswith("!"):
                        continue
                    if header:
                        # The first non-comment line is the column header.
                        header = False
                        continue
                    linearr = line.strip().split("\t")
                    if self.do_ontology.node(linearr[10]) and linearr[16] != "IEA":
                        gene_ids = [linearr[2]]
                        if linearr[1] == "allele":
                            gene_ids = linearr[4].split(",")
                        for gene_id in gene_ids:
                            associations.append(
                                DataManager.create_annotation_record(
                                    line, gene_id, linearr[3],
                                    linearr[1], linearr[0], linearr[10],
                                    linearr[9].split("|"), "D",
                                    linearr[16], linearr[18].split("|"),
                                    linearr[20], linearr[19]))
            self.do_associations = AssociationSetFactory().create_from_assocs(
                assocs=associations, ontology=self.do_ontology)
        # Excluded terms are dropped whether or not an additional file was merged.
        self.do_associations = self.remove_blacklisted_annotations(
            association_set=self.do_associations,
            ontology=self.do_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.DO_EXPERIMENTAL,
                prop=ConfigModuleProperty.EXCLUDE_TERMS))
def main():
    """Command-line entry point: generate WormBase gene descriptions.

    Parses the command line, configures logging, and then, for every organism
    returned by the configuration, loads its data, builds a description for
    each gene (orthology, GO, tissue expression, expression cluster, disease,
    information-poor and sister-species sentences) and writes the results in
    the requested output formats to the configured output directory.
    """
    parser = argparse.ArgumentParser(
        description="Generate gene descriptions for wormbase")
    parser.add_argument("-c", "--config-file", metavar="config_file", dest="config_file", type=str,
                        default="config.yml",
                        # Help text now matches the real default file name.
                        help="configuration file. Default ./config.yml")
    parser.add_argument(
        "-C", "--use-cache", dest="use_cache", action="store_true", default=False,
        help="Use cached source files from cache_location specified in config file. Download them from "
             "raw_file_source (configured in config file) if not yet cached")
    parser.add_argument(
        "-l", "--log-file", metavar="log_file", dest="log_file", type=str, default=None,
        # The default is None, which makes logging.basicConfig log to stderr.
        help="path to the log file to generate. Default: log to standard error")
    parser.add_argument(
        "-L", "--log-level", dest="log_level",
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        help="set the logging level")
    parser.add_argument("-t", "--textpressoapi-token", metavar="textpresso_token", dest="textpresso_token",
                        type=str, help="Texpresso api token")
    parser.add_argument("-o", "--output-formats", metavar="output_formats", dest="output_formats", type=str,
                        nargs="+", default=["ace", "txt", "json", "tsv"],
                        help="file formats to generate. Accepted values "
                             "are: ace, txt, json, tsv")
    args = parser.parse_args()

    conf_parser = GenedescConfigParser(args.config_file)
    logging.basicConfig(filename=args.log_file, level=args.log_level,
                        format='%(asctime)s - %(name)s - %(levelname)s:'
                               '%(message)s',
                        force=True)
    logger = logging.getLogger("WB Gene Description Pipeline")

    organisms_list = conf_parser.get_wb_organisms_to_process()
    human_genes_props = DataManager.get_human_gene_props()
    api_manager = APIManager(textpresso_api_token=args.textpresso_token)
    for organism in organisms_list:
        logger.info("Processing organism " + organism)
        species = conf_parser.get_wb_organisms_info()
        dm, sister_df, df_agr = load_data(organism=organism, conf_parser=conf_parser)
        desc_writer = DescriptionsWriter()
        desc_writer.overall_properties.species = organism

        # The descriptions are stamped with the release following the configured
        # one. Increment the whole trailing number (WS279 -> WS280); bumping only
        # the last character would turn WS279 into WS2710.
        current_release = conf_parser.get_wb_release()
        release_prefix = current_release.rstrip("0123456789")
        release_number = current_release[len(release_prefix):]
        desc_writer.overall_properties.release_version = release_prefix + str(
            int(release_number) + 1).zfill(len(release_number))
        desc_writer.overall_properties.date = datetime.date.today().strftime("%B %d, %Y")

        for gene in dm.get_gene_data():
            logger.debug("Generating description for gene " + gene.name)
            gene_desc = GeneDescription(gene_id=gene.id, config=conf_parser, gene_name=gene.name,
                                        add_gene_name=False)
            selected_orthologs, orth_sent = get_best_orthologs_and_sentence(
                dm=dm, orth_fullnames=dm.orth_fullnames, human_genes_props=human_genes_props,
                gene_desc=gene_desc, api_manager=api_manager, config=conf_parser)
            set_gene_ontology_module(dm=dm, conf_parser=conf_parser, gene_desc=gene_desc, gene=gene)
            set_tissue_expression_sentence(dm=dm, gene=gene, conf_parser=conf_parser, gene_desc=gene_desc)
            # Expression-cluster sentences are only a fallback for genes that
            # have no description text yet.
            if not gene_desc.description:
                set_expression_cluster_sentence(dm=dm, conf_parser=conf_parser, gene_desc=gene_desc,
                                                gene=gene, api_manager=api_manager)
            set_disease_module(df=dm, conf_parser=conf_parser, gene=gene, gene_desc=gene_desc)
            if not gene_desc.go_description:
                set_information_poor_sentence(
                    orth_fullnames=dm.orth_fullnames, selected_orthologs=selected_orthologs,
                    conf_parser=conf_parser, human_df_agr=df_agr, gene_desc=gene_desc, dm=dm, gene=gene)
            gene_desc.set_or_extend_module_description_and_final_stats(
                module=Module.ORTHOLOGY, description=orth_sent)
            if "main_sister_species" in species[organism] and species[organism]["main_sister_species"] and \
                    dm.get_best_orthologs_for_gene(
                        gene.id, orth_species_full_name=[dm.sister_sp_fullname],
                        sister_species_data_fetcher=sister_df,
                        ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP",
                                             "HGI", "HEP"])[0]:
                set_sister_species_sentence(
                    dm=dm, sister_sp_fullname=dm.sister_sp_fullname, sister_df=sister_df, species=species,
                    organism=organism, gene_desc=gene_desc, conf_parser=conf_parser, gene=gene)
            desc_writer.add_gene_desc(gene_desc)
        logger.info("All genes processed for " + organism)

        date_prefix = datetime.date.today().strftime("%Y%m%d")
        if "json" in args.output_formats:
            logger.info("Writing descriptions to json")
            desc_writer.write_json(
                os.path.join(conf_parser.get_out_dir(), date_prefix + "_" + organism + ".json"),
                include_single_gene_stats=True, data_manager=dm)
        if "txt" in args.output_formats:
            logger.info("Writing descriptions to txt")
            desc_writer.write_plain_text(
                os.path.join(conf_parser.get_out_dir(), date_prefix + "_" + organism + ".txt"))
        if "tsv" in args.output_formats:
            logger.info("Writing descriptions to tsv")
            desc_writer.write_tsv(
                os.path.join(conf_parser.get_out_dir(), date_prefix + "_" + organism + ".tsv"))
        if "ace" in args.output_formats:
            logger.info("Writing descriptions to ace")
            curators = ["WBPerson324", "WBPerson37462"]
            # The ace file is written with the configured release, not the incremented one.
            release_version = conf_parser.get_wb_release()
            desc_writer.write_ace(
                os.path.join(conf_parser.get_out_dir(), date_prefix + "_" + organism + ".ace"),
                curators, release_version)
class TestOntologyTools(unittest.TestCase):
    """Compare LCA-based and IC-based term trimming on the test GO and anatomy ontologies."""

    def setUp(self):
        """Load the test configuration, the test GO ontology and the partial GO association file."""
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(os.path.join(self.this_dir, "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(self.this_dir, "data", "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(self.this_dir, "cache", "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    @staticmethod
    def get_associations(gene_id, term_ids, qualifiers, aspect, ecode):
        """Return one annotation record per term id, all sharing gene, qualifiers, aspect and ecode."""
        return [DataManager.create_annotation_record(source_line="", gene_id=gene_id, gene_symbol="",
                                                     gene_type="gene", taxon_id="", object_id=term_id,
                                                     qualifiers=qualifiers, aspect=aspect, ecode=ecode,
                                                     references="", prvdr="WB", date="")
                for term_id in term_ids]

    def _set_trimming_algorithm(self, algorithm):
        """Select the trimming algorithm for both the GO and the expression module."""
        self.conf_parser.config["go_sentences_options"]["trimming_algorithm"] = algorithm
        self.conf_parser.config["expression_sentences_options"]["trimming_algorithm"] = algorithm

    def _set_expression_associations(self, gene, term_ids):
        """Replace the expression associations with verified IDA annotations of ``gene`` to ``term_ids``."""
        associations = self.get_associations(gene.id, term_ids, ["Verified"], "A", "IDA")
        self.df.expression_associations = AssociationSetFactory().create_from_assocs(
            assocs=associations, ontology=self.df.expression_ontology)

    def _describe(self, gene):
        """Build the GO and expression modules for ``gene`` under the current config and compute stats."""
        gene_desc = GeneDescription(gene_id=gene.id, config=self.conf_parser, gene_name=gene.name,
                                    add_gene_name=False)
        set_gene_ontology_module(dm=self.df, conf_parser=self.conf_parser, gene_desc=gene_desc, gene=gene)
        set_expression_module(self.df, self.conf_parser, gene_desc, gene)
        gene_desc.stats.calculate_stats(data_manager=self.df)
        return gene_desc

    def test_trimming_lca(self):
        """For three genes, LCA trimming must cover at least as much as IC trimming."""
        self._set_trimming_algorithm("ic")
        gene = Gene(id="WB:WBGene00000018", name="abl-1", dead=False, pseudo=False)
        self.df.load_ontology_from_file(
            ontology_type=DataType.EXPR,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "anatomy_ontology.WS274.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "anatomy_ontology.WS274.obo"),
            config=self.conf_parser)
        logger.info("Loading expression associations from file")
        expression_options = self.conf_parser.config["expression_sentences_options"]
        expression_options["max_num_terms"] = 5
        expression_options["trim_min_distance_from_root"]["A"] = 4
        expression_options["remove_children_if_parent_is_present"] = False

        # Scenario 1: abl-1
        self._set_expression_associations(
            gene, ["WBbt:0006796", "WBbt:0006759", "WBbt:0005300", "WBbt:0008598", "WBbt:0003681",
                   "WBbt:0005829", "WBbt:0003927", "WBbt:0006751"])
        self._set_trimming_algorithm("lca")
        gene_desc_lca = self._describe(gene)
        self._set_trimming_algorithm("ic")
        # The IC structures are set up explicitly on both ontologies before the
        # first IC-based description; later scenarios reuse them.
        set_ic_ontology_struct(ontology=self.df.go_ontology, relations=self.df.go_relations)
        set_ic_ontology_struct(ontology=self.df.expression_ontology, relations=self.df.expr_relations)
        gene_desc_ic = self._describe(gene)
        self.assertGreaterEqual(gene_desc_lca.stats.coverage_percentage,
                                gene_desc_ic.stats.coverage_percentage, "1")

        # Scenario 2: aat-1
        self._set_trimming_algorithm("lca")
        gene = Gene(id="WB:WBGene00000022", name="aat-1", dead=False, pseudo=False)
        self._set_expression_associations(
            gene, ["WBbt:0005828", "WBbt:0006751", "WBbt:0005439", "WBbt:0005788", "WBbt:0006749",
                   "WBbt:0005300", "WBbt:0005735", "WBbt:0005747", "WBbt:0005772", "WBbt:0005776",
                   "WBbt:0005812", "WBbt:0005741", "WBbt:0005799", "WBbt:0003681"])
        gene_desc_lca = self._describe(gene)
        self._set_trimming_algorithm("ic")
        gene_desc_ic = self._describe(gene)
        self.assertGreaterEqual(gene_desc_lca.stats.coverage_percentage,
                                gene_desc_ic.stats.coverage_percentage, "2")

        # Scenario 3: acr-5
        self._set_trimming_algorithm("lca")
        gene = Gene(id="WB:WBGene00000044", name="acr-5", dead=False, pseudo=False)
        self._set_expression_associations(
            gene, ['WBbt:0003679', 'WBbt:0006759', 'WBbt:0005336', 'WBbt:0006751', 'WBbt:0005300',
                   'WBbt:0005274', 'WBbt:0005741', 'WBbt:0006749', 'WBbt:0005735'])
        gene_desc_lca = self._describe(gene)
        self._set_trimming_algorithm("ic")
        gene_desc_ic = self._describe(gene)
        self.assertGreaterEqual(gene_desc_lca.stats.coverage_percentage,
                                gene_desc_ic.stats.coverage_percentage, "3")
def get_associations(gene_id, term_ids, qualifiers, aspect, ecode):
    """Create one annotation record per term id for a single gene.

    Every record shares ``gene_id``, ``qualifiers``, ``aspect`` and ``ecode``;
    the gene type is fixed to ``"gene"``, the provider to ``"WB"`` and all
    remaining fields are empty strings.

    Returns:
        list: the records, in the iteration order of ``term_ids``.
    """
    records = []
    for term_id in term_ids:
        record = DataManager.create_annotation_record(
            source_line="",
            gene_id=gene_id,
            gene_symbol="",
            gene_type="gene",
            taxon_id="",
            object_id=term_id,
            qualifiers=qualifiers,
            aspect=aspect,
            ecode=ecode,
            references="",
            prvdr="WB",
            date="")
        records.append(record)
    return records
def __init__(self,
             gene_id: str,
             module: Module,
             data_manager: DataManager,
             config: GenedescConfigParser,
             limit_to_group: str = None,
             humans: bool = False):
    """Initialize the sentence generator and group the gene's annotated terms.

    The annotations of ``gene_id`` for the given module are fetched from
    ``data_manager`` and their term ids are stored in ``self.terms_groups``,
    keyed first by ``(aspect, qualifier)`` and then by evidence group.

    Args:
        gene_id (str): the gene whose annotations are read
        module (Module): the module to generate sentences for; it selects the
            ontology and the annotation type
        data_manager (DataManager): provider of ontologies and annotations
        config (GenedescConfigParser): configuration from which evidence
            groups, priorities and pre/postfix sentence maps are read
        limit_to_group (str): limit the evidence codes to the specified group
        humans (bool): forwarded to the config when reading the pre/postfix
            sentence maps
    """
    annot_type = None
    # Compare against the members of the Module class; the previous
    # ``module.DO_BIOMARKER`` read the member through the argument instance.
    if module in (Module.DO_ORTHOLOGY, Module.DO_EXPERIMENTAL, Module.DO_BIOMARKER):
        self.ontology = data_manager.do_ontology
        annot_type = DataType.DO
    elif module == Module.GO:
        self.ontology = data_manager.go_ontology
        annot_type = DataType.GO
    elif module == Module.EXPRESSION:
        self.ontology = data_manager.expression_ontology
        annot_type = DataType.EXPR
    self.evidence_groups_priority_list = config.get_evidence_groups_priority_list(module=module)
    self.prepostfix_sentences_map = config.get_prepostfix_sentence_map(module=module, humans=humans)
    self.terms_groups = defaultdict(lambda: defaultdict(set))
    ev_codes_groups_maps = config.get_evidence_codes_groups_map(module=module)
    annotations = data_manager.get_annotations_for_gene(
        gene_id=gene_id, annot_type=annot_type,
        priority_list=config.get_annotations_priority(module=module))
    self.annotations = annotations
    self.module = module
    self.data_manager = data_manager
    self.annot_type = annot_type
    # Evidence code -> group, restricted to limit_to_group when one is given.
    evidence_codes_groups_map = {
        evcode: group
        for evcode, group in ev_codes_groups_maps.items()
        if limit_to_group is None or limit_to_group in group
    }
    prepostfix_special_cases_sent_map = config.get_prepostfix_sentence_map(
        module=module, special_cases_only=True, humans=humans)
    for annotation in annotations:
        evidence_code = annotation["evidence"]["type"]
        if evidence_code not in evidence_codes_groups_map:
            continue
        aspect = annotation["aspect"]
        base_group = evidence_codes_groups_map[evidence_code]
        ev_group = base_group
        qualifier = "_".join(sorted(annotation["qualifiers"])) if "qualifiers" in annotation else ""
        term_id = annotation["object"]["id"]
        special_cases_key = (aspect, base_group, qualifier)
        if prepostfix_special_cases_sent_map and special_cases_key in prepostfix_special_cases_sent_map:
            term_label = self.ontology.label(term_id, id_if_null=True)
            for special_case in prepostfix_special_cases_sent_map[special_cases_key]:
                # The escaped pattern makes this a literal "label starts with" test.
                if re.match(re.escape(special_case[1]), term_label):
                    # Matching terms go to a derived group named
                    # <group><special case id>, ranked right after the base group.
                    ev_group = base_group + str(special_case[0])
                    if ev_group not in self.evidence_groups_priority_list:
                        self.evidence_groups_priority_list.insert(
                            self.evidence_groups_priority_list.index(base_group) + 1, ev_group)
                    break
        self.terms_groups[(aspect, qualifier)][ev_group].add(term_id)