def set_information_poor_sentence(orth_fullnames: List[str],
                                  selected_orthologs,
                                  conf_parser: GenedescConfigParser,
                                  human_df_agr: DataManager,
                                  gene_desc: GeneDescription,
                                  dm: WBDataManager, gene: Gene):
    """Add fallback description modules for a gene with little information.

    Two independent steps are performed:

    1. If the only ortholog species is "Homo sapiens", the best human
       ortholog is selected and its experimental GO molecular function
       sentences (first for the "contributes_to" qualifier, then for
       "enables") are added to the INFO_POOR_HUMAN_FUNCTION module.
    2. The names of the protein domains stored for the gene are added to
       the PROTEIN_DOMAIN module.

    Args:
        orth_fullnames (List[str]): full names of the species the selected
            orthologs belong to
        selected_orthologs: orthologs passed to the best-ortholog selection
        conf_parser (GenedescConfigParser): a gene descriptions
            configuration object
        human_df_agr (DataManager): data manager holding the human GO data
        gene_desc (GeneDescription): the gene description object to update
        dm (WBDataManager): data manager providing the protein domains
        gene (Gene): not used by this function; kept for interface
            compatibility
    """
    if len(orth_fullnames) == 1 and orth_fullnames[0] == "Homo sapiens":
        best_orth = get_best_human_ortholog_for_info_poor(
            selected_orthologs,
            conf_parser.get_annotations_priority(module=Module.GO),
            human_df_agr,
            config=conf_parser)
        if best_orth:
            # The human data manager is queried with "RGD:"-prefixed IDs.
            # NOTE(review): presumably its GO associations are keyed by RGD
            # identifiers - confirm against the loader.
            if not best_orth.startswith("RGD:"):
                best_orth = "RGD:" + best_orth
            human_go_sent_generator = OntologySentenceGenerator(
                gene_id=best_orth,
                module=Module.GO,
                data_manager=human_df_agr,
                config=conf_parser,
                humans=False,
                limit_to_group="EXPERIMENTAL")
            # Same processing for both qualifiers; the order matters because
            # each non-empty sentence extends the same module description.
            for qualifier in ("contributes_to", "enables"):
                human_func_module_sentences = human_go_sent_generator.get_module_sentences(
                    aspect='F',
                    qualifier=qualifier,
                    merge_groups_with_same_prefix=True,
                    keep_only_best_group=True)
                human_func_sent = human_func_module_sentences.get_description()
                if human_func_sent:
                    gene_desc.set_or_extend_module_description_and_final_stats(
                        module=Module.INFO_POOR_HUMAN_FUNCTION,
                        description="human " +
                        human_df_agr.go_associations.subject_label_map[best_orth] +
                        " " + human_func_sent)

    # The first three characters of the gene ID are dropped before the lookup.
    # NOTE(review): presumably this strips a "WB:" prefix - confirm.
    protein_domains = dm.protein_domains[gene_desc.gene_id[3:]]
    if protein_domains:
        # Only domains with a non-empty name (second field) can be listed.
        domain_names = [ptdom[1] for ptdom in protein_domains if ptdom[1] != ""]
        # Skip the module entirely when no domain has a name, otherwise the
        # sentence would end with a dangling "following domain: ".
        if domain_names:
            dom_word = "domains" if len(domain_names) > 1 else "domain"
            gene_desc.set_or_extend_module_description_and_final_stats(
                module=Module.PROTEIN_DOMAIN,
                description="is predicted to encode a protein with the following "
                + dom_word + ": " + concatenate_words_with_oxford_comma(
                    domain_names,
                    separator=conf_parser.get_terms_delimiter()))
예제 #2
0
def set_alliance_human_orthology_module(orthologs: List[List[str]], gene_desc: GeneDescription,
                                        config: GenedescConfigParser, excluded_orthologs: bool = False):
    """set orthology module for Alliance human orthologs

    The orthologs are sorted by gene name and rendered as "symbol (name)", or just "symbol" when the name is missing.
    At most three orthologs are listed: when more than three are provided, or when `excluded_orthologs` is set, only
    the first three (after sorting) are kept and the sentence prefix states that the list is partial. Nothing is
    done when `orthologs` is empty.

    Args:
        orthologs (List[List[str]]): list of human orthologs, containing gene_id, gene_symbol, and gene_name
        gene_desc (GeneDescription): the gene description object to update
        config (GenedescConfigParser): a gene descriptions configuration object
        excluded_orthologs (bool): whether some of the orthologs have been excluded from the final set. If true, the
            final sentence will include a prefix to specify that some orthologs have been omitted
    """
    if len(orthologs) == 0:
        return
    # A missing gene name is tolerated when building the labels below, so it must be tolerated by the sort as well:
    # comparing None with str raises TypeError. Missing names are treated as empty strings and therefore sort first.
    orthologs_display = sorted(orthologs, key=lambda orth: orth[2] or "")
    prefix = "human"
    if excluded_orthologs or len(orthologs) > 3:
        orthologs_display = orthologs_display[:3]
        prefix = "several human genes including"
    labels = [orth[1] + " (" + orth[2] + ")" if orth[2] else orth[1] for orth in orthologs_display]
    sentence = "orthologous to " + prefix + " " + concatenate_words_with_oxford_comma(
        labels, separator=config.get_terms_delimiter())
    gene_desc.set_or_extend_module_description_and_final_stats(module=Module.ORTHOLOGY, description=sentence)
예제 #3
0
def generate_ortholog_sentence_wormbase_human(orthologs: List[List[str]], human_genes_props: Dict[str, List[str]],
                                              config: GenedescConfigParser):
    """Create the orthology sentence for the human orthologs of a WormBase gene.

    When more than three orthologs are given, only the first three (in input order) are used and the sentence says
    that the list is partial. An ortholog with a non-empty entry in `human_genes_props` is displayed as
    "<first property> (<second property>)"; any other ortholog is displayed with its own symbol. The displayed
    labels are sorted alphabetically.

    Args:
        orthologs (List[List[str]]): list of human orthologs, containing gene_id, gene_symbol
        human_genes_props (Dict[str, List[str]]): dictionary containing human gene properties
        config (GenedescConfigParser): a gene description configuration object
    Returns:
        Tuple[list, str]: the first property of each ortholog that has properties, and the sentence
    """
    truncated = len(orthologs) > 3
    if truncated:
        orthologs = orthologs[:3]
    intro = "several human genes including " if truncated else "human "

    labels = []
    symbols_with_props = []
    for orth in orthologs:
        orth_id = orth[0]
        # membership is tested before indexing so that a missing ID is never looked up directly
        props = human_genes_props[orth_id] if orth_id in human_genes_props else None
        if props:
            labels.append(props[0] + " (" + props[1] + ")")
            symbols_with_props.append(props[0])
        else:
            labels.append(orth[1])
    labels.sort()

    sentence = "is an ortholog of " + intro + concatenate_words_with_oxford_comma(
        labels, separator=config.get_terms_delimiter())
    return symbols_with_props, sentence
def set_expression_cluster_sentence(dm: WBDataManager,
                                    conf_parser: GenedescConfigParser,
                                    gene_desc: GeneDescription, gene: Gene,
                                    api_manager: APIManager):
    """Set the expression cluster modules (anatomy, gene, molecule) of a gene.

    - Anatomy: when the data manager has an expression ontology, the
      description is generated from the "Enriched" annotations; otherwise,
      if anatomy cluster terms exist, they are listed directly.
    - Gene regulation: the regulating genes are listed; when there are more
      than three, only the three with the highest Textpresso popularity
      are kept.
    - Molecule regulation: the first three chemicals are listed; when there
      are more than three, the total count is spelled out in the sentence.

    Each module is completed with the list of studies of its cluster type.

    Args:
        dm (WBDataManager): data manager providing the expression cluster
            data
        conf_parser (GenedescConfigParser): a gene descriptions
            configuration object
        gene_desc (GeneDescription): the gene description object to update
        gene (Gene): the gene whose ID is used to generate the anatomy
            sentences
        api_manager (APIManager): api manager used to get the Textpresso
            popularity of terms
    """
    expr_sentence_generator = OntologySentenceGenerator(
        gene_id=gene.id,
        module=Module.EXPRESSION,
        data_manager=dm,
        config=conf_parser)
    # The first three characters of the gene ID are dropped for the lookups.
    # NOTE(review): presumably this strips a "WB:" prefix - confirm.
    ec_gene_id = gene_desc.gene_id[3:]
    ec_anatomy_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        expression_cluster_type=ExpressionClusterType.ANATOMY,
        feature=ExpressionClusterFeature.STUDIES)
    ec_anatomy_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        feature=ExpressionClusterFeature.TERMS,
        expression_cluster_type=ExpressionClusterType.ANATOMY)
    if dm.expression_ontology is not None:
        expression_enriched_module_sentences = expr_sentence_generator.get_module_sentences(
            aspect='A',
            qualifier="Enriched",
            merge_groups_with_same_prefix=True,
            keep_only_best_group=False)
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description=expression_enriched_module_sentences.get_description(),
            additional_postfix_terms_list=ec_anatomy_studies,
            additional_postfix_final_word="studies",
            use_single_form=True)
    elif ec_anatomy_terms:
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description="is enriched in " +
            concatenate_words_with_oxford_comma(
                ec_anatomy_terms,
                separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_anatomy_studies,
            additional_postfix_final_word="studies",
            use_single_form=True)
    ec_molreg_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        expression_cluster_type=ExpressionClusterType.MOLREG,
        feature=ExpressionClusterFeature.TERMS)
    ec_molreg_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.MOLREG)
    ec_genereg_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        expression_cluster_type=ExpressionClusterType.GENEREG,
        feature=ExpressionClusterFeature.TERMS)
    ec_genereg_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.GENEREG)
    if ec_genereg_terms:
        several_word = ""
        if len(ec_genereg_terms) > 3:
            # Rank by popularity, breaking ties on the term itself. The
            # terms are plain strings, so the tie-break must use the whole
            # term: indexing it would compare a single character and fail
            # on one-character terms.
            ranked_terms = sorted(
                ((term, api_manager.get_textpresso_popularity(term))
                 for term in ec_genereg_terms),
                key=lambda term_pop: (term_pop[1], term_pop[0]),
                reverse=True)
            ec_genereg_terms = [term for term, _ in ranked_terms[:3]]
            several_word = "several genes including "
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_GENE,
            description="is affected by " + several_word +
            concatenate_words_with_oxford_comma(
                ec_genereg_terms,
                separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_genereg_studies,
            additional_postfix_final_word="studies",
            use_single_form=True)
    if ec_molreg_terms:
        several_word = ""
        if len(ec_molreg_terms) > 3:
            several_word = num2words(
                len(ec_molreg_terms)) + " chemicals including "
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_MOLECULE,
            description="is affected by " + several_word +
            concatenate_words_with_oxford_comma(
                ec_molreg_terms[0:3],
                separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_molreg_studies,
            additional_postfix_final_word="studies",
            use_single_form=True)
예제 #5
0
def generate_ortholog_sentence_wormbase_non_c_elegans(orthologs: List[List[str]], orthologs_sp_fullname: str,
                                                      api_manager: APIManager, config: GenedescConfigParser):
    """build orthology sentence for WormBase orthologs from a single species

    With up to three orthologs, their symbols are listed alphabetically. With more than three, orthologs are ranked
    by Textpresso popularity and grouped by gene class; at most three entries are reported, where an entry is
    either a single gene or a gene class represented by its top-ranked member.

    Args:
        orthologs (List[List[str]]): list of orthologs, containing gene_id, gene_symbol
        orthologs_sp_fullname (str): full name of species from which to extract orthologs
        api_manager (APIManager): api manager used to get Textpresso popularity and gene classes
        config (GenedescConfigParser): a gene description configuration object
    Returns:
        str: the orthology sentence, or None if there are no orthologs
    """
    if len(orthologs) == 0:
        return None

    # abbreviate the first word of the species name when it is longer than two characters
    name_parts = orthologs_sp_fullname.split(" ")
    if len(name_parts[0]) > 2:
        name_parts[0] = name_parts[0][0] + "."
        orthologs_sp_fullname = " ".join(name_parts)

    if len(orthologs) <= 3:
        # few orthologs: list their symbols alphabetically
        symbols = [orth[1] for orth in orthologs]
        symbols.sort()
        return "is an ortholog of " + orthologs_sp_fullname + " " + concatenate_words_with_oxford_comma(
            symbols, separator=config.get_terms_delimiter())

    # rank [ortholog, popularity] pairs by descending (popularity, symbol)
    ranked = [[orth, api_manager.get_textpresso_popularity(orth[1])] for orth in orthologs]
    ranked.sort(key=lambda pair: (pair[1], pair[0][1]), reverse=True)

    # split the ranked pairs between those with a gene class and those without
    by_class = defaultdict(list)
    unclassed = []
    for pair in ranked:
        gene_class = api_manager.get_gene_class(pair[0][0])
        if gene_class:
            by_class[gene_class].append(pair)
        else:
            unclassed.append(pair)

    if len(by_class) == 1:
        # a single gene class is not reported as such: its members are handled as individual genes
        unclassed.extend(next(iter(by_class.values())))
        by_class = {}
    else:
        # classes with one member only are handled as individual genes
        for members in by_class.values():
            if len(members) == 1:
                unclassed.extend(members)
    # each remaining class is represented by its first (top-ranked) member
    class_best = {gene_class: members[0] for gene_class, members in by_class.items() if len(members) > 1}

    # entries are tagged 0 (individual gene) or 1 (class representative, followed by the class name)
    candidates = [[pair, 0] for pair in unclassed]
    candidates.extend([pair, 1, gene_class] for gene_class, pair in class_best.items())
    candidates.sort(key=lambda item: item[0][1], reverse=True)
    candidates = candidates[:3]

    plain_symbols = []
    class_names = []
    class_member_symbols = []
    for item in candidates:
        if item[1] == 0:
            plain_symbols.append(item[0][0][1])
        elif item[1] == 1:
            class_names.append(item[2])
            class_member_symbols.append(item[0][0][1])

    parts = []
    if len(plain_symbols) > 0:
        parts.append(orthologs_sp_fullname + " " + concatenate_words_with_oxford_comma(
            plain_symbols, separator=config.get_terms_delimiter()))
    if len(class_names) > 0:
        members_text = concatenate_words_with_oxford_comma(
            class_member_symbols, separator=config.get_terms_delimiter())
        classes_text = concatenate_words_with_oxford_comma(
            class_names, separator=config.get_terms_delimiter())
        class_word = "classes" if len(class_names) > 1 else "class"
        parts.append("members of the " + orthologs_sp_fullname + " " + classes_text +
                     " gene " + class_word + " including " + members_text)
    return "is an ortholog of " + " and ".join(parts)