def set_associations(self, associations_type: DataType, associations: AssociationSet,
                     config: GenedescConfigParser):
    """Set the annotations of the given type and remove blacklisted annotations.

    Args:
        associations_type (DataType): the type of associations to set
        associations (AssociationSet): an association object to set as annotations
        config (GenedescConfigParser): configuration object where to read properties
    """
    if associations_type == DataType.GO:
        logger.info("Setting GO associations")
        self.go_associations = self.remove_blacklisted_annotations(
            association_set=associations, ontology=self.go_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.GO, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.DO:
        logger.info("Setting DO associations")
        self.do_associations = self.remove_blacklisted_annotations(
            association_set=associations, ontology=self.do_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.DO_EXPERIMENTAL, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.EXPR:
        logger.info("Setting Expression associations")
        # Bug fix: this branch previously filtered expression annotations against
        # self.do_ontology; blacklist removal for expression data must use the
        # expression ontology (as the file-loading EXPR path does).
        self.expression_associations = self.remove_blacklisted_annotations(
            association_set=associations, ontology=self.expression_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.EXPRESSION, prop=ConfigModuleProperty.EXCLUDE_TERMS))
class TestConfigParser(unittest.TestCase):
    """Checks that module properties are read correctly from the test config file."""

    def setUp(self):
        log_fmt = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
        logging.basicConfig(filename=None, level="INFO", format=log_fmt)
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        config_file = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
        self.conf_parser = GenedescConfigParser(config_file)

    def test_exclude_terms_list(self):
        # Both GO and DO exclusion lists must be non-empty in the test config
        go_excluded = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.EXCLUDE_TERMS)
        self.assertTrue(len(go_excluded) > 0, "GO exclusion term list not loading")
        do_excluded = self.conf_parser.get_module_property(
            module=Module.DO_EXPERIMENTAL, prop=ConfigModuleProperty.EXCLUDE_TERMS)
        self.assertTrue(len(do_excluded) > 0, "DO terms exclusion not loading")

    def test_rename_terms(self):
        # The test config defines exactly 7 GO renaming rules and none for DO
        go_renames = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS)
        self.assertTrue(len(go_renames) == 7, "GO term renaming list not loading")
        do_renames = self.conf_parser.get_module_property(
            module=Module.DO_EXPERIMENTAL, prop=ConfigModuleProperty.RENAME_TERMS)
        self.assertTrue(do_renames is None, "DO term renaming list should be None")

    def test_evidence_codes(self):
        eg_map = self.conf_parser.get_evidence_codes_groups_map(module=Module.GO)
        self.assertTrue("EXP" in list(eg_map.keys()))
def load_associations_from_file(self, associations_type: DataType, associations_url: str,
                                associations_cache_path: str,
                                config: GenedescConfigParser) -> None:
    """Load associations of the given type from a GAF file (cached locally).

    The file is downloaded to the cache path if not already present, parsed with
    ontobio's GafParser, and the resulting association set is filtered against the
    module's blacklist of excluded terms.

    Args:
        associations_type (DataType): the type of associations to set
        associations_url (str): url to the association file
        associations_cache_path (str): path to cache file for the associations
        config (GenedescConfigParser): configuration object where to read properties
    """
    # Shared parser configuration: strip doubled ID prefixes and keep PAINT annotations
    assoc_config = AssocParserConfig(remove_double_prefixes=True, paint=True)
    if associations_type == DataType.GO:
        logger.info("Loading GO associations from file")
        self.go_associations = AssociationSetFactory().create_from_assocs(
            assocs=GafParser(config=assoc_config).parse(
                file=self._get_cached_file(cache_path=associations_cache_path,
                                           file_source_url=associations_url),
                skipheader=True),
            ontology=self.go_ontology)
        self.go_associations = self.remove_blacklisted_annotations(
            association_set=self.go_associations, ontology=self.go_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.GO, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.DO:
        logger.info("Loading DO associations from file")
        self.do_associations = AssociationSetFactory().create_from_assocs(
            assocs=GafParser(config=assoc_config).parse(
                file=self._get_cached_file(cache_path=associations_cache_path,
                                           file_source_url=associations_url),
                skipheader=True),
            ontology=self.do_ontology)
        # NOTE(review): this reads the blacklist from Module.DO_EXP_AND_BIO while
        # other DO code paths in this file use Module.DO_EXPERIMENTAL — confirm
        # which module's exclusion list is intended here.
        self.do_associations = self.remove_blacklisted_annotations(
            association_set=self.do_associations, ontology=self.do_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.DO_EXP_AND_BIO, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.EXPR:
        logger.info("Loading Expression associations from file")
        self.expression_associations = AssociationSetFactory().create_from_assocs(
            assocs=GafParser(config=assoc_config).parse(
                file=self._get_cached_file(cache_path=associations_cache_path,
                                           file_source_url=associations_url),
                skipheader=True),
            ontology=self.expression_ontology)
        self.expression_associations = self.remove_blacklisted_annotations(
            association_set=self.expression_associations,
            ontology=self.expression_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.EXPRESSION, prop=ConfigModuleProperty.EXCLUDE_TERMS))
def set_ontology(self, ontology_type: DataType, ontology: Ontology, config: GenedescConfigParser,
                 slim_cache_path: str = None) -> None:
    """Store the given ontology and post-process it.

    Post-processing applies term renaming rules, computes node depths, optionally
    builds the IC structure (when the module's trimming algorithm is "ic"), and
    loads the slim set when a slim cache path is provided.

    Args:
        ontology_type (DataType): the type of ontology to set
        ontology (Ontology): an ontology object to store
        config (GenedescConfigParser): configuration object where to read properties
        slim_cache_path (str): path to slim file to use
    """
    if ontology_type == DataType.GO:
        logger.info("Setting GO ontology")
        self.go_ontology = (ontology.subontology(relations=self.go_relations)
                            if self.go_relations else ontology)
    elif ontology_type == DataType.DO:
        logger.info("Setting DO ontology")
        self.do_ontology = (ontology.subontology(relations=self.do_relations)
                            if self.do_relations else ontology)
    elif ontology_type == DataType.EXPR:
        logger.info("Setting Expression ontology")
        self.expression_ontology = (ontology.subontology(relations=self.expr_relations)
                                    if self.expr_relations else ontology)
    module = get_module_from_data_type(ontology_type)
    stored_ontology = self.get_ontology(data_type=ontology_type)
    rename_regex = config.get_module_property(module=module,
                                              prop=ConfigModuleProperty.RENAME_TERMS)
    if rename_regex:
        self.rename_ontology_terms(ontology=stored_ontology,
                                   terms_replacement_regex=rename_regex)
    set_all_depths(ontology=stored_ontology, relations=self.get_relations(ontology_type))
    trimming_algorithm = config.get_module_property(
        module=module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM)
    if trimming_algorithm == "ic":
        set_ic_ontology_struct(ontology=stored_ontology,
                               relations=self.get_relations(ontology_type))
    if slim_cache_path:
        slim_url = config.get_module_property(module=module,
                                              prop=ConfigModuleProperty.SLIM_URL)
        self.load_slim(module=module, slim_url=slim_url, slim_cache_path=slim_cache_path)
class TestGOModule(unittest.TestCase):
    """Tests loading GO ontology and association data through the DataManager."""

    def setUp(self):
        log_fmt = '%(asctime)s - %(name)s - %(levelname)s: %(message)s'
        logging.basicConfig(filename=None, level="ERROR", format=log_fmt)
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        config_path = os.path.join(self.this_dir, os.path.pardir, "tests", "config_test.yml")
        self.conf_parser = GenedescConfigParser(config_path)
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        ontology_file = os.path.join(self.this_dir, "data", "go_gd_test.obo")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + ontology_file,
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        assoc_file = os.path.join(self.this_dir, "data", "gene_association_1.7.wb.partial")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + assoc_file,
            associations_cache_path=os.path.join(self.this_dir, "cache",
                                                 "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        self.assertTrue(self.df.go_ontology is not None)
        parents = self.df.go_ontology.parents("GO:0000075")
        self.assertTrue(any(parent == "GO:0009987" for parent in parents))

    def test_annotations_exist(self):
        self.assertTrue(self.df.go_associations is not None)
        priority = self.conf_parser.get_annotations_priority(module=Module.GO)
        annotations = self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001", annot_type=DataType.GO, include_obsolete=False,
            include_negative_results=False, priority_list=priority)
        self.assertTrue(len(annotations) > 0)

    def test_rename_terms(self):
        renamed_terms = self.conf_parser.get_module_property(
            module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS)
        # After renaming, none of the original term names should be searchable
        self.assertTrue(all(len(self.df.go_ontology.search(term)) == 0
                            for term in list(renamed_terms.keys())))

    def test_exclude_terms(self):
        pass
def __init__(self, gene_id: str, module: Module, data_manager: DataManager,
             config: GenedescConfigParser, limit_to_group: str = None, humans: bool = False):
    """initialize sentence generator object

    Args:
        gene_id (str): id of the gene for which annotations are fetched
        module (Module): the description module this generator produces sentences for
        data_manager (DataManager): provider of ontologies, annotations, and slim sets
        config (GenedescConfigParser): an optional config object from which to read the options
        limit_to_group (str): limit the evidence codes to the specified group
        humans (bool): whether to use the human-specific pre/postfix sentence templates
    """
    # Ontology corresponding to this module's data type (GO, DO, or expression)
    self.ontology = data_manager.get_ontology(
        get_data_type_from_module(module))
    self.config = config
    self.module = module
    # Term ids already used in generated sentences, to avoid repetition
    self.terms_already_covered = set()
    # Nested map: (aspect, qualifier) -> evidence group -> set of term ids
    self.terms_groups = defaultdict(lambda: defaultdict(set))
    self.evidence_groups_priority_list = config.get_evidence_groups_priority_list(
        module=module)
    self.prepostfix_sentences_map = config.get_prepostfix_sentence_map(
        module=module, humans=humans)
    # Annotations for the gene, ordered by the module's configured priority list
    self.gene_annots = data_manager.get_annotations_for_gene(
        gene_id=gene_id, annot_type=get_data_type_from_module(module),
        priority_list=config.get_annotations_priority(module=module))
    # Instantiate the configured trimming strategy: CONF_TO_TRIMMING_CLASS maps the
    # algorithm name from the config to the trimmer class, which is then constructed
    # with the module's ontology, annotations, blacklist, and slim settings.
    self.trimmer = CONF_TO_TRIMMING_CLASS[config.get_module_property(
        module=module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM)](
        ontology=self.ontology,
        annotations=data_manager.get_associations(
            get_data_type_from_module(module)),
        nodeids_blacklist=config.get_module_property(
            module=module, prop=ConfigModuleProperty.EXCLUDE_TERMS),
        slim_terms_ic_bonus_perc=config.get_module_property(
            module=module, prop=ConfigModuleProperty.SLIM_BONUS_PERC),
        slim_set=data_manager.get_slim(module=module))
    self.set_terms_groups(module, config, limit_to_group, humans)
def set_associations(self, associations_type: DataType, associations: AssociationSet,
                     config: GenedescConfigParser):
    """Set the annotations of the given type, applying remapping and blacklist removal.

    Terms are first remapped according to the module's REMAP_TERMS configuration,
    then blacklisted terms are removed; the result is stored on the matching
    attribute. When the module's trimming algorithm is "icGO", annotation-frequency
    based IC values are computed on the ontology afterwards.

    Args:
        associations_type (DataType): the type of associations to set
        associations (AssociationSet): an association object to set as annotations
        config (GenedescConfigParser): configuration object where to read properties
    """
    module = get_module_from_data_type(associations_type)
    ontology = self.get_ontology(associations_type)
    remapped = self.remap_associations(
        associations=associations, ontology=ontology,
        associations_map=config.get_module_property(
            module=module, prop=ConfigModuleProperty.REMAP_TERMS))
    cleaned = self.remove_blacklisted_annotations(
        association_set=remapped, ontology=ontology,
        terms_blacklist=config.get_module_property(
            module=module, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    if associations_type == DataType.GO:
        logger.info("Setting GO associations")
        self.go_associations = cleaned
    elif associations_type == DataType.DO:
        logger.info("Setting DO associations")
        self.do_associations = cleaned
    elif associations_type == DataType.EXPR:
        logger.info("Setting Expression associations")
        self.expression_associations = cleaned
    trimming_algorithm = config.get_module_property(
        module=module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM)
    if trimming_algorithm == "icGO":
        set_ic_annot_freq(self.get_ontology(associations_type),
                          self.get_associations(associations_type))
class TestGOModule(unittest.TestCase):
    """Tests for DataManager loading, caching, gene data, and association handling."""

    def setUp(self):
        # Load the test GO ontology and a partial WB association file into a DataManager
        logging.basicConfig(filename=None, level="ERROR",
                            format='%(asctime)s - %(name)s - %(levelname)s: %(message)s')
        logger.info("Starting DataManager tests")
        self.this_dir = os.path.split(__file__)[0]
        self.conf_parser = GenedescConfigParser(os.path.join(self.this_dir, os.path.pardir,
                                                             "tests", "config_test.yml"))
        self.df = DataManager(do_relations=None, go_relations=["subClassOf", "BFO:0000050"])
        logger.info("Loading go ontology from file")
        self.df.load_ontology_from_file(
            ontology_type=DataType.GO,
            ontology_url="file://" + os.path.join(self.this_dir, "data", "go_gd_test.obo"),
            ontology_cache_path=os.path.join(self.this_dir, "cache", "go_gd_test.obo"),
            config=self.conf_parser)
        logger.info("Loading go associations from file")
        self.df.load_associations_from_file(
            associations_type=DataType.GO,
            associations_url="file://" + os.path.join(self.this_dir, "data",
                                                      "gene_association_1.7.wb.partial"),
            associations_cache_path=os.path.join(self.this_dir, "cache",
                                                 "gene_association_1.7.wb.partial"),
            config=self.conf_parser)

    def test_ontology_exists(self):
        self.assertTrue(self.df.go_ontology is not None)
        self.assertTrue(any(parent == "GO:0009987"
                            for parent in self.df.go_ontology.parents("GO:0000075")))

    def test_annotations_exist(self):
        self.assertTrue(self.df.go_associations is not None)
        self.assertTrue(len(self.df.get_annotations_for_gene(
            gene_id="WB:WBGene00000001", annot_type=DataType.GO,
            include_obsolete=False, include_negative_results=False,
            priority_list=self.conf_parser.get_annotations_priority(module=Module.GO))) > 0)

    def test_rename_terms(self):
        # After renaming, none of the configured original term names should be found
        self.assertTrue(all(len(self.df.go_ontology.search(term)) == 0 for term in list(
            self.conf_parser.get_module_property(
                module=Module.GO, prop=ConfigModuleProperty.RENAME_TERMS).keys())))

    def test_exclude_terms(self):
        # GO:0008286 is blacklisted in the test config and must not appear
        test_annot = self.df.get_annotations_for_gene("WB:WBGene00000001",
                                                      annot_type=DataType.GO)
        self.assertTrue(all([annot["object"]["id"] != "GO:0008286" for annot in test_annot]))

    def test_download_gz_file(self):
        # A .gz source should be cached decompressed (without the .gz suffix)
        test_file = self.df._get_cached_file(
            cache_path=os.path.join(self.this_dir, "cache",
                                    "c_elegans.PRJNA13758.WS273.geneIDs.txt.gz"),
            file_source_url="file://" + os.path.join(
                self.this_dir, "data", "c_elegans.PRJNA13758.WS273.geneIDs.txt.gz"))
        self.assertTrue(test_file == os.path.join(self.this_dir, "cache",
                                                  "c_elegans.PRJNA13758.WS273.geneIDs.txt"))

    def test_gene_data_functions(self):
        # Genes: (id, name, dead, pseudo) — filters should count accordingly
        self.df.set_gene_data(gene_data=[Gene("1", "gene1", True, False),
                                         Gene("2", "gene2", False, True),
                                         Gene("3", "gene3", False, False),
                                         Gene("4", "gene4", True, True)])
        self.assertTrue(len([g for g in self.df.get_gene_data(
            include_dead_genes=False, include_pseudo_genes=False)]) == 1)
        self.assertTrue(len([g for g in self.df.get_gene_data(
            include_dead_genes=True, include_pseudo_genes=False)]) == 2)
        self.assertTrue(len([g for g in self.df.get_gene_data(
            include_dead_genes=False, include_pseudo_genes=True)]) == 2)
        self.assertTrue(len([g for g in self.df.get_gene_data(
            include_dead_genes=True, include_pseudo_genes=True)]) == 4)

    def test_get_human_gene_props(self):
        human_gene_props = self.df.get_human_gene_props()
        self.assertTrue(len(human_gene_props) > 0)

    def test_get_ensembl_hgnc_ids_map(self):
        ensembl_hgnc_ids_map = self.df.get_ensembl_hgnc_ids_map()
        self.assertTrue(len(ensembl_hgnc_ids_map) > 0)

    def test_set_ontology(self):
        # Build a tiny 4-node ontology with node 0 as the root of 1, 2, 3
        ontology = OntologyFactory().create()
        for i in range(4):
            ontology.add_node(i, 'node' + str(i))
        ontology.add_parent(1, 0)
        ontology.add_parent(2, 0)
        ontology.add_parent(3, 0)
        self.df.set_ontology(ontology_type=DataType.GO, ontology=ontology,
                             config=self.conf_parser)
        self.assertTrue(list(self.df.go_ontology.nodes()) == list(ontology.nodes()))

    def test_set_associations(self):
        associations = []
        associations.append(DataManager.create_annotation_record(
            "", "1", "a", "protein_coding", "001", "GO:0019901", "", "F", "EXP", None, "WB", ""))
        associations.append(DataManager.create_annotation_record(
            "", "2", "b", "protein_coding", "001", "GO:0005515", "", "F", "EXP", None, "WB", ""))
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations,
                                                            ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs,
                                 config=self.conf_parser)
        self.assertTrue(self.df.go_associations)

    def test_remap_associations(self):
        # GO:0018996 is configured to be remapped to GO:0042303
        associations = []
        associations.append(DataManager.create_annotation_record(
            "", "1", "a", "protein_coding", "001", "GO:0018996", "", "F", "EXP", None, "WB", ""))
        assocs = AssociationSetFactory().create_from_assocs(assocs=associations,
                                                            ontology=self.df.go_ontology)
        self.df.set_associations(associations_type=DataType.GO, associations=assocs,
                                 config=self.conf_parser)
        self.assertEqual(self.df.go_associations.associations_by_subj["1"][0]["object"]["id"],
                         "GO:0042303")
def load_associations_from_file(self, associations_type: DataType, associations_url: str,
                                associations_cache_path: str, config: GenedescConfigParser,
                                association_additional_url: str = None,
                                association_additional_cache_path: str = None) -> None:
    """Load associations from file, with MOD-specific handling for EXPR and DO data.

    GO associations are delegated to the parent class. Expression and DO data are
    parsed from tab-separated files with custom per-column handling; DO data may be
    supplemented from an additional file that replaces non-IEA annotations.

    Args:
        associations_type (DataType): the type of associations to set
        associations_url (str): url to the association file
        associations_cache_path (str): path to cache file for the associations
        config (GenedescConfigParser): configuration object where to read properties
        association_additional_url (str): url to an additional association file (DO only)
        association_additional_cache_path (str): cache path for the additional file (DO only)
    """
    logger.info("Loading associations from file")
    if associations_type == DataType.GO:
        # Standard GAF loading handled by the base DataManager
        super().load_associations_from_file(associations_type=associations_type,
                                            associations_url=associations_url,
                                            associations_cache_path=associations_cache_path,
                                            config=config)
    elif associations_type == DataType.EXPR:
        associations = []
        file_path = self._get_cached_file(cache_path=associations_cache_path,
                                          file_source_url=associations_url)
        for line in open(file_path):
            # Lines starting with "!" are comments/headers in GAF-style files
            if not line.strip().startswith("!"):
                linearr = line.strip().split("\t")
                # Only keep rows whose term (column 4) exists in the expression ontology
                if self.expression_ontology.node(linearr[4]):
                    gene_id = linearr[0] + ":" + linearr[1]
                    qualifiers = linearr[3].split("|")
                    # Normalize qualifiers: empty, "Partial", or "Certain" become "Verified"
                    if len(qualifiers) == 0 or "Partial" in qualifiers or "Certain" in qualifiers:
                        qualifiers = ["Verified"]
                    # NOTE(review): column meanings assumed GAF-like (2=symbol, 4=term id,
                    # 5=references, 6=evidence, 8=aspect) — confirm against the file spec
                    associations.append(DataManager.create_annotation_record(
                        line, gene_id, linearr[2], linearr[11], linearr[12], linearr[4],
                        qualifiers, linearr[8], linearr[6], linearr[5].split("|"),
                        linearr[14], linearr[13]))
        self.expression_associations = AssociationSetFactory().create_from_assocs(
            assocs=associations, ontology=self.expression_ontology)
        self.expression_associations = self.remove_blacklisted_annotations(
            association_set=self.expression_associations, ontology=self.expression_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.EXPRESSION, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif associations_type == DataType.DO:
        self.do_associations = AssociationSetFactory().create_from_assocs(
            assocs=GafParser().parse(file=self._get_cached_file(
                cache_path=associations_cache_path, file_source_url=associations_url),
                skipheader=True),
            ontology=self.do_ontology)
        if association_additional_cache_path and association_additional_url:
            associations = []
            # Keep only IEA annotations from the base file; the additional file
            # supplies the non-IEA annotations
            for subj_associations in self.do_associations.associations_by_subj.values():
                for association in subj_associations:
                    if association["evidence"]["type"] == "IEA":
                        associations.append(association)
            file_path = self._get_cached_file(cache_path=association_additional_cache_path,
                                              file_source_url=association_additional_url)
            header = True
            for line in open(file_path):
                if not line.strip().startswith("!"):
                    # The first non-comment line is a column header row and is skipped
                    if not header:
                        linearr = line.strip().split("\t")
                        if self.do_ontology.node(linearr[10]) and linearr[16] != "IEA":
                            gene_ids = [linearr[2]]
                            # Allele rows carry comma-separated affected gene ids in column 4
                            if linearr[1] == "allele":
                                gene_ids = linearr[4].split(",")
                            for gene_id in gene_ids:
                                associations.append(DataManager.create_annotation_record(
                                    line, gene_id, linearr[3], linearr[1], linearr[0],
                                    linearr[10], linearr[9].split("|"), "D", linearr[16],
                                    linearr[18].split("|"), linearr[20], linearr[19]))
                    else:
                        header = False
            self.do_associations = AssociationSetFactory().create_from_assocs(
                assocs=associations, ontology=self.do_ontology)
        self.do_associations = self.remove_blacklisted_annotations(
            association_set=self.do_associations, ontology=self.do_ontology,
            terms_blacklist=config.get_module_property(
                module=Module.DO_EXPERIMENTAL, prop=ConfigModuleProperty.EXCLUDE_TERMS))
def load_ontology_from_file(self, ontology_type: DataType, ontology_url: str,
                            ontology_cache_path: str, config: GenedescConfigParser) -> None:
    """Load an ontology from file and post-process it.

    The ontology file is downloaded to the cache path if needed, a subontology is
    built using the configured relations, terms are renamed per the module's rules,
    node depths are computed, and the module's slim set is loaded.

    Args:
        ontology_type (DataType): the type of ontology to set
        ontology_url (str): url to the ontology file
        ontology_cache_path (str): path to cache file for the ontology
        config (GenedescConfigParser): configuration object where to read properties
    """
    new_ontology = None
    module = None
    slim_cache_path = ""
    if ontology_type == DataType.GO:
        logger.info("Loading GO ontology data from file")
        self.go_ontology = OntologyFactory().create(
            self._get_cached_file(file_source_url=ontology_url,
                                  cache_path=ontology_cache_path)).subontology(
            relations=self.go_relations)
        new_ontology = self.go_ontology
        module = Module.GO
        # Slim file is cached next to the ontology cache file
        slim_cache_path = os.path.join(os.path.dirname(os.path.normpath(ontology_cache_path)),
                                       "go_slim.obo")
    elif ontology_type == DataType.DO:
        logger.info("Loading DO ontology data from file")
        self.do_ontology = OntologyFactory().create(
            self._get_cached_file(file_source_url=ontology_url,
                                  cache_path=ontology_cache_path)).subontology(
            relations=self.do_relations)
        new_ontology = self.do_ontology
        module = Module.DO_EXPERIMENTAL
        slim_cache_path = os.path.join(os.path.dirname(os.path.normpath(ontology_cache_path)),
                                       "do_slim.obo")
    elif ontology_type == DataType.EXPR:
        logger.info("Loading Expression ontology data from file")
        # Expression subontology is built without restricting relations
        self.expression_ontology = OntologyFactory().create(
            self._get_cached_file(file_source_url=ontology_url,
                                  cache_path=ontology_cache_path)).subontology()
        new_ontology = self.expression_ontology
        module = Module.EXPRESSION
        slim_cache_path = os.path.join(os.path.dirname(os.path.normpath(ontology_cache_path)),
                                       "exp_slim.obo")
    terms_replacement_regex = config.get_module_property(
        module=module, prop=ConfigModuleProperty.RENAME_TERMS)
    if terms_replacement_regex:
        self.rename_ontology_terms(ontology=new_ontology,
                                   terms_replacement_regex=terms_replacement_regex)
    if ontology_type == DataType.EXPR:
        DataManager.add_article_to_expression_nodes(self.expression_ontology)
    # Pre-compute depth for every node, one connected subgraph per root
    for root_id in new_ontology.get_roots():
        set_all_depths_in_subgraph(ontology=new_ontology, root_id=root_id, relations=None)
    # NOTE(review): load_slim is called unconditionally; if ontology_type matched no
    # branch, module is None and slim_cache_path is "" — presumably unreachable, but
    # verify load_slim tolerates these values.
    slim_url = config.get_module_property(module=module, prop=ConfigModuleProperty.SLIM_URL)
    self.load_slim(module=module, slim_url=slim_url, slim_cache_path=slim_cache_path)
def get_trimmed_terms_by_common_ancestor(self, terms: Set[str], terms_already_covered,
                                         aspect: str, config: GenedescConfigParser,
                                         high_priority_terms: List[str] = None):
    """Trim a set of terms to the configured maximum, preferring high priority terms.

    High priority terms are trimmed only if they alone exceed the maximum; remaining
    slots are filled with trimmed low priority terms. Children/parents overlapping
    the high priority set are removed from the low priority set.

    Args:
        terms (Set[str]): the term ids to trim
        terms_already_covered: set of term ids already used in other sentences;
            updated in place
        aspect (str): the data aspect (e.g. 'F', 'P', 'C', 'D', 'A'), used to pick
            the minimum distance from root
        config (GenedescConfigParser): configuration object where to read properties
        high_priority_terms (List[str]): term ids that must be kept with priority

    Returns:
        tuple: (trimmed terms, whether an "others" suffix is needed, ancestors that
        cover multiple children)
    """
    dist_root = config.get_module_property(module=self.module,
                                           prop=ConfigModuleProperty.DISTANCE_FROM_ROOT)
    add_mul_common_anc = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.ADD_MULTIPLE_TO_COMMON_ANCEST)
    max_terms = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.MAX_NUM_TERMS_IN_SENTENCE)
    trimming_algorithm = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.TRIMMING_ALGORITHM)
    slim_set = self.data_manager.get_slim(module=self.module)
    slim_bonus_perc = config.get_module_property(module=self.module,
                                                 prop=ConfigModuleProperty.SLIM_BONUS_PERC)
    add_others_highp = False
    add_others_lowp = False
    ancestors_covering_multiple_children = set()
    if not dist_root:
        # Default minimum distance from root per aspect when not configured
        dist_root = {'F': 1, 'P': 1, 'C': 2, 'D': 3, 'A': 3}
    terms_high_priority = [term for term in terms
                           if high_priority_terms and term in high_priority_terms]
    # NOTE(review): a list comprehension never yields None, so this guard looks
    # unreachable — kept as-is for behavior preservation
    if terms_high_priority is None:
        terms_high_priority = []
    if len(terms_high_priority) > max_terms:
        # First try to shrink the high priority set by dropping redundant children
        terms_high_priority = self.remove_children_if_parents_present(
            terms_high_priority, self.ontology, terms_already_covered)
    if len(terms_high_priority) > max_terms:
        logger.debug("Reached maximum number of terms. Applying trimming to high priority terms")
        terms_high_priority, add_others_highp = get_best_nodes(
            terms_high_priority, trimming_algorithm, max_terms, self.ontology,
            terms_already_covered,
            ancestors_covering_multiple_children if add_mul_common_anc else None,
            slim_bonus_perc, dist_root[aspect], slim_set,
            nodeids_blacklist=config.get_module_property(
                module=self.module, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    else:
        terms_already_covered.update(terms_high_priority)
    terms_low_priority = [term for term in terms
                          if not high_priority_terms or term not in high_priority_terms]
    # Remaining capacity after placing the high priority terms
    trimming_threshold = max_terms - len(terms_high_priority)
    if 0 < trimming_threshold < len(terms_low_priority):
        terms_low_priority, add_others_lowp = get_best_nodes(
            terms_low_priority, trimming_algorithm, trimming_threshold, self.ontology,
            terms_already_covered,
            ancestors_covering_multiple_children if add_mul_common_anc else None,
            slim_bonus_perc, dist_root[aspect], slim_set,
            nodeids_blacklist=config.get_module_property(
                module=self.module, prop=ConfigModuleProperty.EXCLUDE_TERMS))
    elif trimming_threshold <= 0 < len(terms_low_priority):
        # No room left for low priority terms; signal them via the "others" suffix
        add_others_lowp = True
    terms = terms_high_priority
    terms_low_priority_orig = terms_low_priority[:]
    # remove exact overlap
    terms_low_priority = list(set(terms_low_priority) - set(terms_high_priority))
    # remove possible children of terms in the high priority list
    # (union with the high priority set so the parent check sees both groups,
    # then the overlap is subtracted again below)
    terms_low_priority = list(set(terms_low_priority) | set(terms_high_priority))
    terms_low_priority = OntologySentenceGenerator.remove_children_if_parents_present(
        terms_low_priority, self.ontology)
    # remove possible parents of terms in the high priority list
    terms_low_priority = list(set(terms_low_priority) | set(terms_high_priority))
    terms_low_priority = OntologySentenceGenerator.remove_parents_if_child_present(
        terms_low_priority, self.ontology)
    terms_low_priority = list(set(terms_low_priority) - set(terms_high_priority))
    if len(terms_low_priority) < len(terms_low_priority_orig):
        add_others_lowp = True
    terms.extend(terms_low_priority)
    # cutoff terms - if number of terms with high priority is higher than max_num_terms
    terms = terms[0:max_terms]
    return terms, add_others_highp or add_others_lowp, ancestors_covering_multiple_children
def get_module_sentences(self, config: GenedescConfigParser, aspect: str, qualifier: str = '',
                         keep_only_best_group: bool = False,
                         merge_groups_with_same_prefix: bool = False,
                         high_priority_term_ids: List[str] = None):
    """generate description for a specific combination of aspect and qualifier

    Args:
        config (GenedescConfigParser): a configuration object from which to read properties
        aspect (str): a data type aspect
        qualifier (str): qualifier
        keep_only_best_group (bool): whether to get only the evidence group with highest
            priority and discard the other evidence groups
        merge_groups_with_same_prefix (bool): whether to merge the phrases for evidence
            groups with the same prefix
        high_priority_term_ids (List[str]): list of ids for terms that must always appear
            in the sentence with higher priority than the other terms. Trimming is not
            applied to these terms

    Returns:
        ModuleSentences: the module sentences
    """
    # Read all per-module options up front
    cat_several_words = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_CATEGORY_WORD)
    del_overlap = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.REMOVE_OVERLAP)
    remove_parents = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.DEL_PARENTS_IF_CHILD)
    remove_child_terms = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.DEL_CHILDREN_IF_PARENT)
    max_terms = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.MAX_NUM_TERMS_IN_SENTENCE)
    exclude_terms = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.EXCLUDE_TERMS)
    cutoff_final_word = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_WORD)
    rename_cell = config.get_module_property(
        module=self.module, prop=ConfigModuleProperty.RENAME_CELL)
    if not cat_several_words:
        # Default per-aspect category words used when truncating "several ..." phrases
        cat_several_words = {'F': 'functions', 'P': 'processes', 'C': 'components',
                             'D': 'diseases', 'A': 'tissues'}
    sentences = []
    terms_already_covered = set()
    # Map each evidence group to its position in the priority list
    evidence_group_priority = {eg: p for p, eg in
                               enumerate(self.evidence_groups_priority_list)}
    # Process evidence groups for this (aspect, qualifier) in priority order
    for terms, evidence_group, priority in sorted(
            [(t, eg, evidence_group_priority[eg]) for eg, t in
             self.terms_groups[(aspect, qualifier)].items()], key=lambda x: x[2]):
        terms, trimmed, add_others, ancestors_covering_multiple_children = self.reduce_terms(
            terms, max_terms, aspect, config, del_overlap, terms_already_covered,
            exclude_terms, remove_parents, remove_child_terms, high_priority_term_ids)
        # Only emit a sentence when a template exists for this combination and
        # some terms survived trimming
        if (aspect, evidence_group, qualifier) in self.prepostfix_sentences_map \
                and len(terms) > 0:
            sentences.append(
                _get_single_sentence(
                    node_ids=terms, ontology=self.ontology, aspect=aspect,
                    evidence_group=evidence_group, qualifier=qualifier,
                    prepostfix_sentences_map=self.prepostfix_sentences_map,
                    terms_merged=False, trimmed=trimmed, add_others=add_others,
                    truncate_others_generic_word=cutoff_final_word,
                    truncate_others_aspect_words=cat_several_words,
                    ancestors_with_multiple_children=ancestors_covering_multiple_children,
                    rename_cell=rename_cell))
        if keep_only_best_group:
            # Highest-priority group processed first; stop after it
            return ModuleSentences(sentences)
    if merge_groups_with_same_prefix:
        sentences = self.merge_sentences_with_same_prefix(
            sentences=sentences, remove_parent_terms=remove_parents,
            rename_cell=rename_cell, high_priority_term_ids=high_priority_term_ids)
    return ModuleSentences(sentences)