def set_information_poor_sentence(orth_fullnames: List[str], selected_orthologs, conf_parser: GenedescConfigParser,
                                  human_df_agr: DataManager, gene_desc: GeneDescription, dm: WBDataManager,
                                  gene: Gene):
    """Add the fallback sentences used for information-poor genes to a gene description.

    Two independent pieces of information are added:

    * If human is the only ortholog species, the best human ortholog is selected with
      ``get_best_human_ortholog_for_info_poor`` and its GO sentences for aspect 'F' (limited to the
      "EXPERIMENTAL" group) are stored in ``Module.INFO_POOR_HUMAN_FUNCTION``, once for the
      "contributes_to" qualifier and once for "enables". Each sentence is prefixed with
      ``"human <ortholog label> "``.
    * The non-empty protein domain names registered for the gene are listed in
      ``Module.PROTEIN_DOMAIN``. Nothing is written when the gene has no named domain.

    Args:
        orth_fullnames (List[str]): full names of the species the selected orthologs belong to
        selected_orthologs: the selected orthologs, passed as-is to the best-ortholog selector
        conf_parser (GenedescConfigParser): a gene descriptions configuration object
        human_df_agr (DataManager): data manager holding the human GO data
        gene_desc (GeneDescription): the gene description object to update
        dm (WBDataManager): WormBase data manager providing ``protein_domains``
        gene (Gene): not used by this function; kept for interface compatibility
    """
    # The human-function fallback applies only when human is the sole ortholog species.
    if len(orth_fullnames) == 1 and orth_fullnames[0] == "Homo sapiens":
        best_orth = get_best_human_ortholog_for_info_poor(
            selected_orthologs, conf_parser.get_annotations_priority(module=Module.GO), human_df_agr,
            config=conf_parser)
        if best_orth:
            # NOTE(review): presumably the human data set keys genes by "RGD:"-prefixed ids - confirm.
            if not best_orth.startswith("RGD:"):
                best_orth = "RGD:" + best_orth
            human_go_sent_generator = OntologySentenceGenerator(
                gene_id=best_orth, module=Module.GO, data_manager=human_df_agr, config=conf_parser, humans=False,
                limit_to_group="EXPERIMENTAL")
            # Same sentence construction for both qualifiers, in this order.
            for qualifier in ("contributes_to", "enables"):
                human_func_sent = human_go_sent_generator.get_module_sentences(
                    aspect='F', qualifier=qualifier, merge_groups_with_same_prefix=True,
                    keep_only_best_group=True).get_description()
                if human_func_sent:
                    gene_desc.set_or_extend_module_description_and_final_stats(
                        module=Module.INFO_POOR_HUMAN_FUNCTION,
                        description="human " + human_df_agr.go_associations.subject_label_map[best_orth] + " " +
                                    human_func_sent)

    # The first three characters of the description's gene id are dropped to obtain the lookup key
    # (presumably a "WB:" prefix - confirm against the data manager).
    protein_domains = dm.protein_domains[gene_desc.gene_id[3:]]
    # Only domains with a non-empty name (second field) can be mentioned in the sentence.
    domain_names = [ptdom[1] for ptdom in (protein_domains or []) if ptdom[1] != ""]
    if domain_names:
        dom_word = "domains" if len(domain_names) > 1 else "domain"
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.PROTEIN_DOMAIN,
            description="is predicted to encode a protein with the following " + dom_word + ": " +
                        concatenate_words_with_oxford_comma(domain_names,
                                                            separator=conf_parser.get_terms_delimiter()))
def set_alliance_human_orthology_module(orthologs: List[List[str]], gene_desc: GeneDescription,
                                        config: GenedescConfigParser, excluded_orthologs: bool = False):
    """Write the orthology sentence for Alliance human orthologs into a gene description.

    Nothing is written when ``orthologs`` is empty. Otherwise the orthologs are sorted by gene name and
    each one is rendered as ``symbol (name)``, or as ``symbol`` alone when its name is empty. When more
    than three orthologs are given, or ``excluded_orthologs`` is set, only the first three (after sorting)
    are listed and the sentence says "several human genes including".

    Args:
        orthologs (List[List[str]]): list of human orthologs, each containing gene_id, gene_symbol, and
            gene_name
        gene_desc (GeneDescription): the gene description object to update
        config (GenedescConfigParser): a gene descriptions configuration object
        excluded_orthologs (bool): whether some of the orthologs have been excluded from the final set.
            If true, the sentence is prefixed to signal that some orthologs have been omitted
    """
    if len(orthologs) == 0:
        return

    by_name = sorted(orthologs, key=lambda orth: orth[2])
    if excluded_orthologs or len(orthologs) > 3:
        shown = by_name[0:3]
        prefix = "several human genes including"
    else:
        shown = by_name
        prefix = "human"

    labels = []
    for orth in shown:
        if orth[2]:
            labels.append(orth[1] + " (" + orth[2] + ")")
        else:
            # no gene name available: fall back to the bare symbol
            labels.append(orth[1])

    sentence = "orthologous to " + prefix + " " + concatenate_words_with_oxford_comma(
        labels, separator=config.get_terms_delimiter())
    gene_desc.set_or_extend_module_description_and_final_stats(module=Module.ORTHOLOGY, description=sentence)
def generate_ortholog_sentence_wormbase_human(orthologs: List[List[str]], human_genes_props: Dict[str, List[str]],
                                              config: GenedescConfigParser):
    """Build the orthology sentence for WormBase human orthologs.

    At most the first three orthologs are used; when more are provided the sentence says "several human
    genes including". An ortholog with a non-empty entry in ``human_genes_props`` is rendered as
    ``<props[0]> (<props[1]>)``, any other one by the symbol given in ``orthologs``. The rendered labels
    are sorted before being joined.

    Args:
        orthologs (List[List[str]]): list of human orthologs, each containing gene_id, gene_symbol
        human_genes_props (Dict[str, List[str]]): human gene properties, keyed by gene id
        config (GenedescConfigParser): a gene description configuration object

    Returns:
        Tuple[list, str]: the first property of every used ortholog that has properties (in input order),
            and the sentence
    """
    too_many = len(orthologs) > 3
    shown = orthologs[0:3] if too_many else orthologs
    prefix = "several human genes including " if too_many else "human "

    labels = []
    known_symbols = []
    for orth in shown:
        gene_id = orth[0]
        props = human_genes_props[gene_id] if gene_id in human_genes_props else None
        if props:
            labels.append(props[0] + " (" + props[1] + ")")
            known_symbols.append(props[0])
        else:
            # unknown gene or empty properties: use the symbol that came with the ortholog
            labels.append(orth[1])
    labels.sort()

    orth_sentence = "is an ortholog of " + prefix + concatenate_words_with_oxford_comma(
        labels, separator=config.get_terms_delimiter())
    return known_symbols, orth_sentence
def set_expression_cluster_sentence(dm: WBDataManager, conf_parser: GenedescConfigParser,
                                    gene_desc: GeneDescription, gene: Gene, api_manager: APIManager):
    """Add the expression cluster sentences (anatomy, gene regulation, molecule regulation) to a description.

    * Anatomy: when ``dm.expression_ontology`` is available the sentence comes from the expression
      ontology sentence generator (aspect 'A', qualifier "Enriched"); otherwise, if anatomy terms exist,
      a plain "is enriched in ..." sentence is built from them.
    * Gene regulation: lists the regulating genes; when there are more than three, only the three with
      the highest Textpresso popularity are kept.
    * Molecule regulation: lists at most the first three chemicals; when there are more than three, the
      total count is spelled out in the sentence.

    Every sentence is stored with the matching studies as postfix terms.

    Args:
        dm (WBDataManager): WormBase data manager providing expression cluster features
        conf_parser (GenedescConfigParser): a gene descriptions configuration object
        gene_desc (GeneDescription): the gene description object to update
        gene (Gene): the gene; its ``id`` is used by the ontology sentence generator
        api_manager (APIManager): api manager used to obtain Textpresso popularity values
    """
    expr_sentence_generator = OntologySentenceGenerator(
        gene_id=gene.id, module=Module.EXPRESSION, data_manager=dm, config=conf_parser)
    # Expression cluster features are looked up with the gene id minus its first three characters
    # (presumably a "WB:" prefix - confirm against the data manager).
    ec_gene_id = gene_desc.gene_id[3:]

    ec_anatomy_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, expression_cluster_type=ExpressionClusterType.ANATOMY,
        feature=ExpressionClusterFeature.STUDIES)
    ec_anatomy_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, feature=ExpressionClusterFeature.TERMS,
        expression_cluster_type=ExpressionClusterType.ANATOMY)
    if dm.expression_ontology is not None:
        expression_enriched_module_sentences = expr_sentence_generator.get_module_sentences(
            aspect='A', qualifier="Enriched", merge_groups_with_same_prefix=True, keep_only_best_group=False)
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description=expression_enriched_module_sentences.get_description(),
            additional_postfix_terms_list=ec_anatomy_studies, additional_postfix_final_word="studies",
            use_single_form=True)
    elif ec_anatomy_terms:
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description="is enriched in " + concatenate_words_with_oxford_comma(
                ec_anatomy_terms, separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_anatomy_studies, additional_postfix_final_word="studies",
            use_single_form=True)

    ec_molreg_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, expression_cluster_type=ExpressionClusterType.MOLREG,
        feature=ExpressionClusterFeature.TERMS)
    ec_molreg_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.MOLREG)
    ec_genereg_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, expression_cluster_type=ExpressionClusterType.GENEREG,
        feature=ExpressionClusterFeature.TERMS)
    ec_genereg_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id, feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.GENEREG)

    if ec_genereg_terms:
        several_word = ""
        if len(ec_genereg_terms) > 3:
            # Rank by popularity (highest first). Ties are broken on the whole term, in the same
            # descending direction; indexing into the term string would only compare a single
            # character and would fail on one-character terms.
            terms_with_popularity = sorted(
                [[term, api_manager.get_textpresso_popularity(term)] for term in ec_genereg_terms],
                key=lambda term_pop: (term_pop[1], term_pop[0]), reverse=True)
            ec_genereg_terms = [term for term, _ in terms_with_popularity[0:3]]
            several_word = "several genes including "
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_GENE,
            description="is affected by " + several_word + concatenate_words_with_oxford_comma(
                ec_genereg_terms, separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_genereg_studies, additional_postfix_final_word="studies",
            use_single_form=True)

    if ec_molreg_terms:
        several_word = ""
        if len(ec_molreg_terms) > 3:
            # spell out the total number of chemicals, e.g. "five chemicals including "
            several_word = num2words(len(ec_molreg_terms)) + " chemicals including "
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_MOLECULE,
            description="is affected by " + several_word + concatenate_words_with_oxford_comma(
                ec_molreg_terms[0:3], separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_molreg_studies, additional_postfix_final_word="studies",
            use_single_form=True)
def generate_ortholog_sentence_wormbase_non_c_elegans(orthologs: List[List[str]], orthologs_sp_fullname: str,
                                                      api_manager: APIManager, config: GenedescConfigParser):
    """Build the orthology sentence for the WormBase orthologs of a single species.

    The first word of the species name is abbreviated to its initial followed by a dot when it is longer
    than two characters (e.g. "Caenorhabditis elegans" becomes "C. elegans").

    With up to three orthologs, all their symbols are listed in alphabetical order. With more than three,
    orthologs are ranked by Textpresso popularity and grouped by gene class: a class is mentioned (with
    its most popular member) only if at least two classes were found and the class has more than one
    member; all other orthologs compete individually. The three most popular entries are reported.

    Args:
        orthologs (List[List[str]]): list of orthologs, each containing gene_id, gene_symbol
        orthologs_sp_fullname (str): full name of the species the orthologs belong to
        api_manager (APIManager): api manager used to get Textpresso popularity and gene classes
        config (GenedescConfigParser): a gene description configuration object

    Returns:
        str: the orthology sentence, or None when ``orthologs`` is empty
    """
    if len(orthologs) == 0:
        return None

    name_parts = orthologs_sp_fullname.split(" ")
    if len(name_parts[0]) > 2:
        name_parts[0] = name_parts[0][0] + "."
        orthologs_sp_fullname = " ".join(name_parts)

    if len(orthologs) <= 3:
        # few orthologs: list all of them alphabetically
        symbols = [orth[1] for orth in orthologs]
        symbols.sort()
        return "is an ortholog of " + orthologs_sp_fullname + " " + concatenate_words_with_oxford_comma(
            symbols, separator=config.get_terms_delimiter())

    # Rank [ortholog, popularity] pairs by popularity, then by symbol.
    # NOTE(review): reverse=True applies to the whole key, so ties come out in descending symbol order.
    ranked = sorted(
        ([orth, api_manager.get_textpresso_popularity(orth[1])] for orth in orthologs),
        key=lambda pair: (pair[1], pair[0][1]), reverse=True)

    members_by_class = defaultdict(list)
    standalone = []
    for pair in ranked:
        gene_class = api_manager.get_gene_class(pair[0][0])
        if gene_class:
            members_by_class[gene_class].append(pair)
        else:
            standalone.append(pair)

    if len(members_by_class) == 1:
        # a single class overall is not reported as a class: its members are listed individually
        standalone.extend(next(iter(members_by_class.values())))
        members_by_class = {}
    else:
        # classes with a single member are not worth naming either
        for members in members_by_class.values():
            if len(members) == 1:
                standalone.extend(members)
    # every remaining class is represented by its top-ranked member
    class_representative = {gene_class: members[0] for gene_class, members in members_by_class.items()
                            if len(members) > 1}

    # candidate layout: [pair, 0] for a standalone gene, [pair, 1, gene_class] for a class
    candidates = [[pair, 0] for pair in standalone]
    candidates.extend([pair, 1, gene_class] for gene_class, pair in class_representative.items())
    candidates.sort(key=lambda candidate: candidate[0][1], reverse=True)
    candidates = candidates[0:3]

    standalone_symbols = [candidate[0][0][1] for candidate in candidates if candidate[1] == 0]
    class_candidates = [candidate for candidate in candidates if candidate[1] == 1]
    class_names = [candidate[2] for candidate in class_candidates]
    class_member_symbols = [candidate[0][0][1] for candidate in class_candidates]

    fragments = []
    if standalone_symbols:
        fragments.append(orthologs_sp_fullname + " " + concatenate_words_with_oxford_comma(
            standalone_symbols, separator=config.get_terms_delimiter()))
    if class_names:
        members_text = concatenate_words_with_oxford_comma(
            class_member_symbols, separator=config.get_terms_delimiter())
        classes_text = concatenate_words_with_oxford_comma(class_names, separator=config.get_terms_delimiter())
        classes_word = "classes" if len(class_names) > 1 else "class"
        fragments.append("members of the " + orthologs_sp_fullname + " " + classes_text + " gene " +
                         classes_word + " including " + members_text)
    return "is an ortholog of " + " and ".join(fragments)