def set_information_poor_sentence(orth_fullnames: List[str], selected_orthologs, ensembl_hgnc_ids_map,
                                  conf_parser: GenedescConfigParser, human_df_agr: DataManager,
                                  gene_desc: GeneDescription, dm: WBDataManager, gene: Gene):
    """Set description modules for information-poor genes.

    When the only ortholog species is Homo sapiens and a best human ortholog can
    be selected, adds a human-gene GO molecular function sentence; in all cases
    adds a protein domain sentence when predicted domains are available.

    Args:
        orth_fullnames (List[str]): full names of the ortholog species
        selected_orthologs: the set of selected orthologs for the gene
        ensembl_hgnc_ids_map: map from ensembl gene ids to hgnc ids
        conf_parser (GenedescConfigParser): a gene descriptions configuration object
        human_df_agr (DataManager): data manager with human (AGR) gene data
        gene_desc (GeneDescription): the gene description object to update
        dm (WBDataManager): the WormBase data manager with protein domain data
        gene (Gene): the gene being described
    """
    # BUG FIX: the species literal was garbled ("H**o sapiens") so this comparison
    # could never succeed and the human-function sentence was never produced.
    if len(orth_fullnames) == 1 and orth_fullnames[0] == "Homo sapiens":
        best_orth = get_best_human_ortholog_for_info_poor(
            selected_orthologs, ensembl_hgnc_ids_map,
            conf_parser.get_annotations_priority(module=Module.GO),
            human_df_agr, config=conf_parser)
        if best_orth:
            # human GO associations in the AGR data manager are keyed by "RGD:"-prefixed ids
            if not best_orth.startswith("RGD:"):
                best_orth = "RGD:" + best_orth
            human_go_sent_generator = OntologySentenceGenerator(
                gene_id=best_orth, module=Module.GO, data_manager=human_df_agr,
                config=conf_parser, humans=False, limit_to_group="EXPERIMENTAL")
            # aspect 'F' = molecular function; keep only the best group of sentences
            human_func_module_sentences = human_go_sent_generator.get_module_sentences(
                config=conf_parser, aspect='F', merge_groups_with_same_prefix=True,
                keep_only_best_group=True)
            human_func_sent = human_func_module_sentences.get_description()
            if human_func_sent:
                gene_desc.set_or_extend_module_description_and_final_stats(
                    module=Module.INFO_POOR_HUMAN_FUNCTION,
                    description="human " +
                                human_df_agr.go_associations.subject_label_map[best_orth] +
                                " " + human_func_sent)
    # gene_desc.gene_id carries a 3-char prefix (e.g. "WB:"); domain data is keyed by the bare id
    protein_domains = dm.protein_domains[gene_desc.gene_id[3:]]
    if protein_domains:
        dom_word = "domain" if len(protein_domains) == 1 else "domains"
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.PROTEIN_DOMAIN,
            description="is predicted to encode a protein with the following " + dom_word + ": " +
                        concatenate_words_with_oxford_comma(
                            # prefer the domain description (index 1), fall back to its id (index 0)
                            [ptdom[1] if ptdom[1] != "" else ptdom[0] for ptdom in protein_domains]))
def generate_ortholog_sentence_wormbase_human(orthologs: List[List[str]],
                                              human_genes_props: Dict[str, List[str]]):
    """Build the orthology sentence for WormBase human orthologs.

    Args:
        orthologs (List[List[str]]): list of human orthologs, containing gene_id, gene_symbol
        human_genes_props (Dict[str, List[str]]): dictionary containing human gene properties

    Returns:
        Tuple[list, str]: the matched ortholog symbols and the sentence
    """
    prefix = "human "
    if len(orthologs) > 3:
        # too many orthologs: mention only the first three
        prefix = "several human genes including "
        orthologs = orthologs[:3]
    display_names = []
    matched_symbols = []
    for ortholog in orthologs:
        props = human_genes_props.get(ortholog[0])
        if props:
            # known human gene: show "SYMBOL (full name)" and record its symbol
            display_names.append(props[1] + " (" + props[2] + ")")
            matched_symbols.append(props[1])
        else:
            # no properties available: fall back to the ortholog's own symbol
            display_names.append(ortholog[1])
    orth_sentence = ("is an ortholog of " + prefix +
                     concatenate_words_with_oxford_comma(sorted(display_names)))
    return matched_symbols, orth_sentence
def set_alliance_human_orthology_module(orthologs: List[List[str]], gene_desc: GeneDescription,
                                        config: GenedescConfigParser,
                                        excluded_orthologs: bool = False):
    """Set the orthology module for Alliance human orthologs.

    Args:
        orthologs (List[List[str]]): list of human orthologs, containing gene_id, gene_symbol,
            and gene_name
        gene_desc (GeneDescription): the gene description object to update
        config (GenedescConfigParser): a gene descriptions configuration object
        excluded_orthologs (bool): whether some of the orthologs have been excluded from the
            final set. If true, the final sentence will include a prefix to specify that some
            orthologs have been omitted
    """
    if not orthologs:
        return
    # order alphabetically by gene name for a stable, readable listing
    display = sorted(orthologs, key=lambda orth: orth[2])
    if excluded_orthologs or len(orthologs) > 3:
        # list is incomplete (truncated or pre-filtered): flag it in the prefix
        display = display[:3]
        prefix = "several human genes including"
    else:
        prefix = "human"
    formatted = []
    for orth in display:
        if orth[2]:
            formatted.append(orth[1] + " (" + orth[2] + ")")
        else:
            # no gene name available: show only the symbol
            formatted.append(orth[1])
    sentence = "orthologous to " + prefix + " " + concatenate_words_with_oxford_comma(
        formatted, separator=config.get_terms_delimiter())
    gene_desc.set_or_extend_module_description_and_final_stats(module=Module.ORTHOLOGY,
                                                               description=sentence)
def set_expression_cluster_sentence(dm: WBDataManager, conf_parser: GenedescConfigParser,
                                    gene_desc: GeneDescription, gene: Gene,
                                    api_manager: APIManager):
    """Set the expression cluster modules (anatomy, gene regulation, molecule regulation).

    Reads expression cluster terms and studies from the WormBase data manager and adds
    the corresponding description modules to the gene description object.

    Args:
        dm (WBDataManager): the WormBase data manager with expression cluster data
        conf_parser (GenedescConfigParser): a gene descriptions configuration object
        gene_desc (GeneDescription): the gene description object to update
        gene (Gene): the gene being described
        api_manager (APIManager): api manager used to rank terms by Textpresso popularity
    """
    expr_sentence_generator = OntologySentenceGenerator(gene_id=gene.id, module=Module.EXPRESSION,
                                                        data_manager=dm, config=conf_parser)
    # gene_desc.gene_id carries a 3-char prefix (e.g. "WB:"); cluster data is keyed by the bare id
    ec_gene_id = gene_desc.gene_id[3:]
    ec_anatomy_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, expression_cluster_type=ExpressionClusterType.ANATOMY,
        feature=ExpressionClusterFeature.STUDIES)
    ec_anatomy_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, feature=ExpressionClusterFeature.TERMS,
        expression_cluster_type=ExpressionClusterType.ANATOMY)
    if dm.expression_ontology is not None:
        # an anatomy ontology is loaded: build the sentence from ontology module sentences
        # (anatomy aspect 'A', "Enriched" qualifier)
        expression_enriched_module_sentences = expr_sentence_generator.get_module_sentences(
            aspect='A', qualifier="Enriched", merge_groups_with_same_prefix=True,
            keep_only_best_group=False)
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description=expression_enriched_module_sentences.get_description(),
            additional_postfix_terms_list=ec_anatomy_studies,
            additional_postfix_final_word="studies", use_single_form=True)
    elif ec_anatomy_terms:
        # no ontology available: fall back to a flat list of anatomy terms
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description="is enriched in " + concatenate_words_with_oxford_comma(
                ec_anatomy_terms, separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_anatomy_studies,
            additional_postfix_final_word="studies", use_single_form=True)
    ec_molreg_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, expression_cluster_type=ExpressionClusterType.MOLREG,
        feature=ExpressionClusterFeature.TERMS)
    ec_molreg_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.MOLREG)
    ec_genereg_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, expression_cluster_type=ExpressionClusterType.GENEREG,
        feature=ExpressionClusterFeature.TERMS)
    ec_genereg_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.GENEREG)
    if ec_genereg_terms:
        several_word = ""
        if len(ec_genereg_terms) > 3:
            # keep only the 3 most popular terms, ranked by Textpresso popularity (descending)
            # NOTE(review): the tie-break key x[0][1] is the term's *second character*; a full
            # alphabetical tie-break would be x[0] - confirm whether this is intended
            t_p = [
                t_p for t_p in sorted(
                    [[term, api_manager.get_textpresso_popularity(term)]
                     for term in ec_genereg_terms],
                    key=lambda x: (x[1], x[0][1]), reverse=True)
            ]
            ec_genereg_terms = [term for term, popularity in t_p[0:3]]
            several_word = "several genes including "
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_GENE,
            description="is affected by " + several_word + concatenate_words_with_oxford_comma(
                ec_genereg_terms, separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_genereg_studies,
            additional_postfix_final_word="studies", use_single_form=True)
    if ec_molreg_terms:
        several_word = ""
        if len(ec_molreg_terms) > 3:
            # spell out the total count (e.g. "twenty-five chemicals including");
            # only the first 3 terms are listed below
            several_word = num2words(len(ec_molreg_terms)) + " chemicals including "
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_MOLECULE,
            description="is affected by " + several_word + concatenate_words_with_oxford_comma(
                ec_molreg_terms[0:3], separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_molreg_studies,
            additional_postfix_final_word="studies", use_single_form=True)
def set_or_extend_module_description_and_final_stats(self, module: Module,
                                                     module_sentences: ModuleSentences = None,
                                                     description: str = None,
                                                     additional_postfix_terms_list: List[str] = None,
                                                     additional_postfix_final_word: str = None,
                                                     use_single_form: bool = False):
    """Set description text and stats for a specific module.

    If previous data is present in the specified module, the provided description and the
    stats are merged with the existing ones.

    Args:
        module (Module): the description module to update
        module_sentences (ModuleSentences): optional - module sentences object from which to
            take the description and stats
        description (str): optional - description text to be added
        additional_postfix_terms_list (List[str]): optional - list of terms to be merged and
            added as postfix to the description
        additional_postfix_final_word (str): optional - word to be added at the end of the
            postfix (automatically converted to plural if the list of terms has more than one
            element)
        use_single_form (bool): whether to use a single form for the final word without
            transforming it to plural
    """
    desc = ""
    if module_sentences:
        # take the text from the pre-built module sentences and propagate the trimming flag
        desc = module_sentences.get_description()
        self.stats.trimmed = self.stats.trimmed or any([sent.trimmed for sent in
                                                        module_sentences.sentences])
    elif description:
        inflect_engine = inflect.engine()
        desc = description
        if additional_postfix_terms_list and len(additional_postfix_terms_list) > 0:
            # append e.g. " <term1>, <term2>, and <term3> studies"; the final word is
            # pluralized unless use_single_form is set or there is only one term
            desc += " " + concatenate_words_with_oxford_comma(additional_postfix_terms_list) + " " + \
                    (additional_postfix_final_word if use_single_form or
                     len(additional_postfix_terms_list) == 1 else
                     inflect_engine.plural_noun(additional_postfix_final_word))
    if desc:
        if self.description and self.description != self.gene_name:
            # a description already exists: drop its trailing period and join with "; "
            self.description = self.description[0:-1] + "; " + desc + "."
        else:
            # first module text for this gene: capitalize when no gene-name prefix is used
            if not self.add_gene_name or not self.gene_name:
                desc = desc[0].upper() + desc[1:]
            self.description = self.gene_name + " " + desc + "." if self.add_gene_name else desc + "."
    # per-module description fields and final id stats.
    # NOTE(review): the GO/expression/DO branches dereference module_sentences.get_ids(),
    # so they assume those modules are always invoked with module_sentences set - confirm
    # at call sites
    if module == Module.GO_FUNCTION:
        self.go_function_description = self._concatenate_description(desc,
                                                                     self.go_function_description)
        self.stats.set_final_go_ids_f = self._get_merged_ids(
            module_sentences.get_ids(experimental_only=False), self.stats.set_final_go_ids_f)
        self.stats.set_final_experimental_go_ids_f = self._get_merged_ids(
            module_sentences.get_ids(experimental_only=True),
            self.stats.set_final_experimental_go_ids_f)
    elif module == Module.GO_PROCESS:
        self.go_process_description = self._concatenate_description(desc,
                                                                    self.go_process_description)
        self.stats.set_final_go_ids_p = self._get_merged_ids(
            module_sentences.get_ids(experimental_only=False), self.stats.set_final_go_ids_p)
        self.stats.set_final_experimental_go_ids_p = self._get_merged_ids(
            module_sentences.get_ids(experimental_only=True),
            self.stats.set_final_experimental_go_ids_p)
    elif module == Module.GO_COMPONENT:
        self.go_component_description = self._concatenate_description(desc,
                                                                      self.go_component_description)
        self.stats.set_final_go_ids_c = self._get_merged_ids(
            module_sentences.get_ids(experimental_only=False), self.stats.set_final_go_ids_c)
        self.stats.set_final_experimental_go_ids_c = self._get_merged_ids(
            module_sentences.get_ids(experimental_only=True),
            self.stats.set_final_experimental_go_ids_c)
    elif module == Module.EXPRESSION:
        self.tissue_expression_description = self._concatenate_description(
            desc, self.tissue_expression_description)
        self.stats.set_final_expression_ids = self._get_merged_ids(
            module_sentences.get_ids(experimental_only=False), self.stats.set_final_expression_ids)
    elif module == Module.EXPRESSION_CLUSTER_GENE:
        self.gene_expression_cluster_description = self._concatenate_description(
            desc, self.gene_expression_cluster_description)
    elif module == Module.EXPRESSION_CLUSTER_ANATOMY:
        self.anatomy_expression_cluster_description = self._concatenate_description(
            desc, self.anatomy_expression_cluster_description)
    elif module == Module.EXPRESSION_CLUSTER_MOLECULE:
        self.molecule_expression_cluster_description = self._concatenate_description(
            desc, self.molecule_expression_cluster_description)
    elif module == Module.DO_EXPERIMENTAL:
        self.do_experimental_description = self._concatenate_description(
            desc, self.do_experimental_description)
        self.stats.set_final_do_ids = self._get_merged_ids(
            module_sentences.get_ids(experimental_only=False), self.stats.set_final_do_ids)
    elif module == Module.DO_BIOMARKER:
        self.do_biomarker_description = self._concatenate_description(
            desc, self.do_biomarker_description)
        self.stats.set_final_do_ids = self._get_merged_ids(
            module_sentences.get_ids(experimental_only=False), self.stats.set_final_do_ids)
    elif module == Module.DO_ORTHOLOGY:
        self.do_orthology_description = self._concatenate_description(
            desc, self.do_orthology_description)
        self.stats.set_final_do_ids = self._get_merged_ids(
            module_sentences.get_ids(experimental_only=False), self.stats.set_final_do_ids)
    elif module == Module.SISTER_SP:
        self.sister_species_description = self._concatenate_description(
            desc, self.sister_species_description)
    elif module == Module.ORTHOLOGY:
        self.orthology_description = self._concatenate_description(desc,
                                                                   self.orthology_description)
    elif module == Module.INFO_POOR_HUMAN_FUNCTION:
        self.human_gene_function_description = self._concatenate_description(
            desc, self.human_gene_function_description)
    elif module == Module.PROTEIN_DOMAIN:
        self.protein_domain_description = self._concatenate_description(
            desc, self.protein_domain_description)
    # Multimodule fields: rebuild the aggregated GO / DO descriptions from their parts
    if module == Module.GO_PROCESS or module == Module.GO_FUNCTION or module == Module.GO_COMPONENT:
        self.go_description = self._merge_descriptions(
            [self.go_function_description, self.go_process_description,
             self.go_component_description])
    if module == Module.DO_EXPERIMENTAL or module == Module.DO_BIOMARKER or module == Module.DO_ORTHOLOGY:
        self.do_description = self._merge_descriptions(
            [self.do_experimental_description, self.do_biomarker_description,
             self.do_orthology_description])
        # "(multiple)" markers in the merged text count final DO terms that cover
        # several initial DO terms
        self.stats.number_final_do_term_covering_multiple_initial_do_terms = self.do_description.count(
            "(multiple)")
def generate_ortholog_sentence_wormbase_non_c_elegans(orthologs: List[List[str]],
                                                      orthologs_sp_fullname: str,
                                                      api_manager: APIManager,
                                                      config: GenedescConfigParser):
    """Build the orthology sentence for WormBase non-human orthologs.

    Args:
        orthologs (List[List[str]]): list of orthologs, containing gene_id, gene_symbol
        orthologs_sp_fullname (str): full name of species from which to extract orthologs
        api_manager (APIManager): api manager to send requests to wormbase and textpresso
        config (GenedescConfigParser): a gene description configuration object

    Returns:
        str: the orthology sentence, or None when no orthologs are given
    """
    orth_sentence = None
    if len(orthologs) > 0:
        # abbreviate the genus, e.g. "Caenorhabditis briggsae" -> "C. briggsae"
        fullname_arr = orthologs_sp_fullname.split(" ")
        if len(fullname_arr[0]) > 2:
            fullname_arr[0] = fullname_arr[0][0] + "."
            orthologs_sp_fullname = " ".join(fullname_arr)
        if len(orthologs) > 3:
            # sort orthologs by tpc popularity and alphabetically (if tied)
            orthologs_pop = [o_p for o_p in
                             sorted([[ortholog, api_manager.get_textpresso_popularity(ortholog[1])]
                                     for ortholog in orthologs],
                                    key=lambda x: (x[1], x[0][1]), reverse=True)]
            # partition the ranked orthologs by WormBase gene class (if any)
            classes_orth_pop = defaultdict(list)
            orthologs_pop_wo_class = []
            for o_p in orthologs_pop:
                gene_class = api_manager.get_gene_class(o_p[0][0])
                if gene_class:
                    classes_orth_pop[gene_class].append(o_p)
                else:
                    orthologs_pop_wo_class.append(o_p)
            if len(list(classes_orth_pop.keys())) == 1:
                # a single class adds no grouping value: treat its members as class-less
                orthologs_pop_wo_class.extend(classes_orth_pop[list(classes_orth_pop.keys())[0]])
                classes_orth_pop = {}
            else:
                # classes with a single member are also listed individually; for the rest,
                # keep only each class's most popular ortholog as its representative
                for gene_class, orths_with_pop in classes_orth_pop.items():
                    if len(orths_with_pop) == 1:
                        orthologs_pop_wo_class.extend(orths_with_pop)
                classes_orth_pop = {gene_class: ops[0] for gene_class, ops in
                                    classes_orth_pop.items() if len(ops) > 1}
            # merge individual orthologs (tag 0) and class representatives (tag 1),
            # re-rank by popularity (x[0][1]) and keep the top 3
            sorted_items = [[o_p, 0] for o_p in orthologs_pop_wo_class]
            sorted_items.extend([[o_p, 1, gene_class] for gene_class, o_p in
                                 classes_orth_pop.items()])
            sorted_items.sort(key=lambda x: x[0][1], reverse=True)
            if len(sorted_items) > 3:
                sorted_items = sorted_items[0:3]
            gene_symbols_wo_class = [item[0][0][1] for item in sorted_items if item[1] == 0]
            classes_symbols = [item[2] for item in sorted_items if item[1] == 1]
            genes_symbols_in_classes = [item[0][0][1] for item in sorted_items if item[1] == 1]
            sentences_arr = []
            if len(gene_symbols_wo_class) > 0:
                sentences_arr.append(orthologs_sp_fullname + " " +
                                     concatenate_words_with_oxford_comma(
                                         gene_symbols_wo_class,
                                         separator=config.get_terms_delimiter()))
            if len(classes_symbols) > 0:
                genes_symbols_in_classes_sent = concatenate_words_with_oxford_comma(
                    genes_symbols_in_classes, separator=config.get_terms_delimiter())
                classes_symbols_sent = concatenate_words_with_oxford_comma(
                    classes_symbols, separator=config.get_terms_delimiter())
                classes_word = "classes" if len(classes_symbols) > 1 else "class"
                sentences_arr.append("members of the " + orthologs_sp_fullname + " " +
                                     classes_symbols_sent + " gene " + classes_word +
                                     " including " + genes_symbols_in_classes_sent)
            orth_sentence = "is an ortholog of " + " and ".join(sentences_arr)
        else:
            # sort orthologs alphabetically
            orthologs_symbols = sorted([orth[1] for orth in orthologs])
            orth_sentence = "is an ortholog of " + orthologs_sp_fullname + " " + \
                            concatenate_words_with_oxford_comma(
                                orthologs_symbols, separator=config.get_terms_delimiter())
    return orth_sentence