Пример #1
0
def set_information_poor_sentence(orth_fullnames: List[str], selected_orthologs, ensembl_hgnc_ids_map,
                                  conf_parser: GenedescConfigParser, human_df_agr: DataManager,
                                  gene_desc: GeneDescription, dm: WBDataManager, gene: Gene):
    """add fallback description modules for a gene with little information of its own

    Two independent pieces may be added to ``gene_desc``:

    * when the only ortholog species is human, the function-aspect GO sentence of the best human ortholog,
      prefixed with "human <ortholog label>" (``Module.INFO_POOR_HUMAN_FUNCTION``)
    * a sentence listing the protein domains stored for the gene (``Module.PROTEIN_DOMAIN``)

    Args:
        orth_fullnames (List[str]): full names of the species of the selected orthologs
        selected_orthologs: the selected orthologs, passed to ``get_best_human_ortholog_for_info_poor``
        ensembl_hgnc_ids_map: id mapping, passed to ``get_best_human_ortholog_for_info_poor``
        conf_parser (GenedescConfigParser): a gene descriptions configuration object
        human_df_agr (DataManager): data manager used to build the GO sentence of the human ortholog
        gene_desc (GeneDescription): the gene description object to update
        dm (WBDataManager): data manager providing the protein domains
        gene (Gene): the gene being described (not used by this function)
    """
    # The species name has to be spelled exactly "Homo sapiens"; with a mangled literal this branch never runs.
    if len(orth_fullnames) == 1 and orth_fullnames[0] == "Homo sapiens":
        best_orth = get_best_human_ortholog_for_info_poor(selected_orthologs, ensembl_hgnc_ids_map,
                                                          conf_parser.get_annotations_priority(module=Module.GO),
                                                          human_df_agr, config=conf_parser)
        if best_orth:
            # the lookups below are keyed by ids carrying the "RGD:" prefix
            if not best_orth.startswith("RGD:"):
                best_orth = "RGD:" + best_orth
            human_go_sent_generator = OntologySentenceGenerator(gene_id=best_orth, module=Module.GO,
                                                                data_manager=human_df_agr, config=conf_parser,
                                                                humans=False, limit_to_group="EXPERIMENTAL")
            human_func_module_sentences = human_go_sent_generator.get_module_sentences(
                config=conf_parser, aspect='F', merge_groups_with_same_prefix=True, keep_only_best_group=True)
            human_func_sent = human_func_module_sentences.get_description()
            if human_func_sent:
                ortholog_label = human_df_agr.go_associations.subject_label_map[best_orth]
                gene_desc.set_or_extend_module_description_and_final_stats(
                    module=Module.INFO_POOR_HUMAN_FUNCTION,
                    description="human " + ortholog_label + " " + human_func_sent)

    # the first three characters of the gene id are dropped for the lookup
    # NOTE(review): presumably a "WB:" prefix - confirm against the data manager keys
    protein_domains = dm.protein_domains[gene_desc.gene_id[3:]]
    if protein_domains:
        dom_word = "domains" if len(protein_domains) > 1 else "domain"
        # each domain is shown with its second field, falling back to the first one when that is empty
        domain_labels = [ptdom[1] if ptdom[1] != "" else ptdom[0] for ptdom in protein_domains]
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.PROTEIN_DOMAIN,
            description="is predicted to encode a protein with the following " + dom_word + ": " +
                        concatenate_words_with_oxford_comma(domain_labels))
Пример #2
0
def generate_ortholog_sentence_wormbase_human(
        orthologs: List[List[str]], human_genes_props: Dict[str, List[str]]):
    """build orthology sentence for WormBase human orthologs

    At most the first three orthologs are mentioned. An ortholog with a non-empty entry in ``human_genes_props``
    is shown as "<props[1]> (<props[2]>)"; any other ortholog is shown with its own symbol. The shown labels are
    sorted alphabetically in the sentence.

    Args:
        orthologs (List[List[str]]): list of human orthologs, containing gene_id, gene_symbol
        human_genes_props (Dict[str, List[str]]): dictionary containing human gene properties, keyed by gene_id
    Returns:
        Tuple[list, str]: the ``props[1]`` values of the mentioned orthologs that have properties (in input
            order) and the sentence
    """
    too_many = len(orthologs) > 3
    mentioned = orthologs[0:3] if too_many else orthologs
    prefix = "several human genes including " if too_many else "human "

    labels = []
    symbols_with_props = []
    for orth in mentioned:
        props = human_genes_props.get(orth[0])
        if props:
            labels.append(props[1] + " (" + props[2] + ")")
            symbols_with_props.append(props[1])
        else:
            labels.append(orth[1])
    labels.sort()

    orth_sentence = "is an ortholog of " + prefix + concatenate_words_with_oxford_comma(labels)
    return symbols_with_props, orth_sentence
Пример #3
0
def set_alliance_human_orthology_module(orthologs: List[List[str]], gene_desc: GeneDescription,
                                        config: GenedescConfigParser, excluded_orthologs: bool = False):
    """set orthology module for Alliance human orthologs

    The orthologs are sorted by gene name and at most three of them are mentioned. Nothing is set when the list
    of orthologs is empty.

    Args:
        orthologs (List[List[str]]): list of human orthologs, containing gene_id, gene_symbol, and gene_name
        gene_desc (GeneDescription): the gene description object to update
        config (GenedescConfigParser): a gene descriptions configuration object
        excluded_orthologs (bool): whether some of the orthologs have been excluded from the final set. If true, the
            final sentence will include a prefix to specify that some orthologs have been omitted
    """
    if not orthologs:
        return
    prefix = "human"
    # A missing gene name is accepted when building the labels below, so it must not break the sort either:
    # a None name is ordered as an empty string instead of raising TypeError when compared with a str.
    orthologs_display = sorted(orthologs, key=lambda orth: orth[2] or "")
    if excluded_orthologs or len(orthologs) > 3:
        orthologs_display = orthologs_display[0:3]
        prefix = "several human genes including"
    # symbol followed by the gene name in parentheses, or the symbol alone when there is no name
    labels = [orth[1] + " (" + orth[2] + ")" if orth[2] else orth[1] for orth in orthologs_display]
    sentence = "orthologous to " + prefix + " " + concatenate_words_with_oxford_comma(
        labels, separator=config.get_terms_delimiter())
    gene_desc.set_or_extend_module_description_and_final_stats(module=Module.ORTHOLOGY, description=sentence)
def set_expression_cluster_sentence(dm: WBDataManager,
                                    conf_parser: GenedescConfigParser,
                                    gene_desc: GeneDescription, gene: Gene,
                                    api_manager: APIManager):
    """set the expression cluster modules (anatomy, gene regulation, molecule regulation) of a gene description

    * anatomy: when ``dm.expression_ontology`` is available, the sentence is generated from the "Enriched"
      annotations of aspect 'A'; otherwise, if anatomy terms exist, an "is enriched in ..." sentence is built
      from them. In both cases the anatomy studies are appended as postfix
    * gene regulation: "is affected by ..." the gene regulation terms; when there are more than three, only the
      three with the highest textpresso popularity are mentioned
    * molecule regulation: "is affected by ..." the first three molecule regulation terms; when there are more
      than three, the total number of chemicals is spelled out

    Args:
        dm (WBDataManager): data manager providing the expression cluster features
        conf_parser (GenedescConfigParser): a gene descriptions configuration object
        gene_desc (GeneDescription): the gene description object to update
        gene (Gene): the gene being described
        api_manager (APIManager): api manager used to get the textpresso popularity of terms
    """
    expr_sentence_generator = OntologySentenceGenerator(
        gene_id=gene.id,
        module=Module.EXPRESSION,
        data_manager=dm,
        config=conf_parser)
    # the first three characters of the gene id are dropped for the expression cluster lookups
    # NOTE(review): presumably a "WB:" prefix - confirm against the data manager keys
    ec_gene_id = gene_desc.gene_id[3:]
    ec_anatomy_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        expression_cluster_type=ExpressionClusterType.ANATOMY,
        feature=ExpressionClusterFeature.STUDIES)
    ec_anatomy_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        feature=ExpressionClusterFeature.TERMS,
        expression_cluster_type=ExpressionClusterType.ANATOMY)
    if dm.expression_ontology is not None:
        expression_enriched_module_sentences = expr_sentence_generator.get_module_sentences(
            aspect='A',
            qualifier="Enriched",
            merge_groups_with_same_prefix=True,
            keep_only_best_group=False)
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description=expression_enriched_module_sentences.get_description(),
            additional_postfix_terms_list=ec_anatomy_studies,
            additional_postfix_final_word="studies",
            use_single_form=True)
    elif ec_anatomy_terms:
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_ANATOMY,
            description="is enriched in " +
            concatenate_words_with_oxford_comma(
                ec_anatomy_terms,
                separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_anatomy_studies,
            additional_postfix_final_word="studies",
            use_single_form=True)
    ec_molreg_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        expression_cluster_type=ExpressionClusterType.MOLREG,
        feature=ExpressionClusterFeature.TERMS)
    ec_molreg_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.MOLREG)
    ec_genereg_terms = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        expression_cluster_type=ExpressionClusterType.GENEREG,
        feature=ExpressionClusterFeature.TERMS)
    ec_genereg_studies = dm.get_expression_cluster_feature(
        gene_id=ec_gene_id,
        feature=ExpressionClusterFeature.STUDIES,
        expression_cluster_type=ExpressionClusterType.GENEREG)
    if ec_genereg_terms:
        several_word = ""
        if len(ec_genereg_terms) > 3:
            # Rank the terms by textpresso popularity, breaking ties on the term itself. The tie-break must be
            # the whole term: indexing into the term string compared only its second character and raised
            # IndexError for one-character terms.
            terms_by_popularity = sorted(
                ((term, api_manager.get_textpresso_popularity(term)) for term in ec_genereg_terms),
                key=lambda term_pop: (term_pop[1], term_pop[0]),
                reverse=True)
            ec_genereg_terms = [term for term, _ in terms_by_popularity[0:3]]
            several_word = "several genes including "
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_GENE,
            description="is affected by " + several_word +
            concatenate_words_with_oxford_comma(
                ec_genereg_terms,
                separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_genereg_studies,
            additional_postfix_final_word="studies",
            use_single_form=True)
    if ec_molreg_terms:
        several_word = ""
        if len(ec_molreg_terms) > 3:
            # the total number of chemicals is spelled out, but only the first three are listed
            several_word = num2words(
                len(ec_molreg_terms)) + " chemicals including "
        gene_desc.set_or_extend_module_description_and_final_stats(
            module=Module.EXPRESSION_CLUSTER_MOLECULE,
            description="is affected by " + several_word +
            concatenate_words_with_oxford_comma(
                ec_molreg_terms[0:3],
                separator=conf_parser.get_terms_delimiter()) + " based on",
            additional_postfix_terms_list=ec_molreg_studies,
            additional_postfix_final_word="studies",
            use_single_form=True)
    def set_or_extend_module_description_and_final_stats(self, module: Module,
                                                         module_sentences: ModuleSentences = None,
                                                         description: str = None,
                                                         additional_postfix_terms_list: List[str] = None,
                                                         additional_postfix_final_word: str = None,
                                                         use_single_form: bool = False):
        """add the text of a module to the gene description and update the related fields and stats

        The text is taken from ``module_sentences`` when provided, otherwise from ``description`` followed by the
        optional postfix. If the gene description already contains text other than the gene name, the new text is
        appended to it after "; ", otherwise it starts the description. Nothing is modified when there is no text
        to add.

        Args:
            module (Module): the description module to update
            module_sentences (ModuleSentences): optional - module sentences object from which to take the description
                and stats. The GO, expression and DO modules read their ids from this object
            description (str): optional - description text to be added, used only if module_sentences is not provided
            additional_postfix_terms_list (List[str]): optional - list of terms to be merged and added as postfix to
                the description
            additional_postfix_final_word: optional - word to be added at the end of the postfix (automatically
                converted to plural if the list of terms has more than one element)
            use_single_form (bool): whether to use a single form for the final word without transforming it to plural
        """
        new_text = ""
        if module_sentences:
            new_text = module_sentences.get_description()
            was_trimmed = self.stats.trimmed
            self.stats.trimmed = was_trimmed or any(sent.trimmed for sent in module_sentences.sentences)
        elif description:
            inflect_engine = inflect.engine()
            new_text = description
            if additional_postfix_terms_list:
                postfix_terms = concatenate_words_with_oxford_comma(additional_postfix_terms_list)
                if use_single_form or len(additional_postfix_terms_list) == 1:
                    final_word = additional_postfix_final_word
                else:
                    final_word = inflect_engine.plural_noun(additional_postfix_final_word)
                new_text += " " + postfix_terms + " " + final_word
        if not new_text:
            return

        # Full description
        if self.description and self.description != self.gene_name:
            # the last character of the existing text (its closing period) is replaced by the "; " separator
            self.description = self.description[:-1] + "; " + new_text + "."
        else:
            # the text opens the description: capitalize it unless it is going to follow the gene name
            if not (self.add_gene_name and self.gene_name):
                new_text = new_text[0].upper() + new_text[1:]
            if self.add_gene_name:
                self.description = self.gene_name + " " + new_text + "."
            else:
                self.description = new_text + "."

        # Single module fields
        if module == Module.GO_FUNCTION:
            self.go_function_description = self._concatenate_description(new_text, self.go_function_description)
            all_ids = module_sentences.get_ids(experimental_only=False)
            self.stats.set_final_go_ids_f = self._get_merged_ids(all_ids, self.stats.set_final_go_ids_f)
            experimental_ids = module_sentences.get_ids(experimental_only=True)
            self.stats.set_final_experimental_go_ids_f = self._get_merged_ids(
                experimental_ids, self.stats.set_final_experimental_go_ids_f)
        elif module == Module.GO_PROCESS:
            self.go_process_description = self._concatenate_description(new_text, self.go_process_description)
            all_ids = module_sentences.get_ids(experimental_only=False)
            self.stats.set_final_go_ids_p = self._get_merged_ids(all_ids, self.stats.set_final_go_ids_p)
            experimental_ids = module_sentences.get_ids(experimental_only=True)
            self.stats.set_final_experimental_go_ids_p = self._get_merged_ids(
                experimental_ids, self.stats.set_final_experimental_go_ids_p)
        elif module == Module.GO_COMPONENT:
            self.go_component_description = self._concatenate_description(new_text, self.go_component_description)
            all_ids = module_sentences.get_ids(experimental_only=False)
            self.stats.set_final_go_ids_c = self._get_merged_ids(all_ids, self.stats.set_final_go_ids_c)
            experimental_ids = module_sentences.get_ids(experimental_only=True)
            self.stats.set_final_experimental_go_ids_c = self._get_merged_ids(
                experimental_ids, self.stats.set_final_experimental_go_ids_c)
        elif module == Module.EXPRESSION:
            self.tissue_expression_description = self._concatenate_description(
                new_text, self.tissue_expression_description)
            all_ids = module_sentences.get_ids(experimental_only=False)
            self.stats.set_final_expression_ids = self._get_merged_ids(all_ids, self.stats.set_final_expression_ids)
        elif module == Module.EXPRESSION_CLUSTER_GENE:
            self.gene_expression_cluster_description = self._concatenate_description(
                new_text, self.gene_expression_cluster_description)
        elif module == Module.EXPRESSION_CLUSTER_ANATOMY:
            self.anatomy_expression_cluster_description = self._concatenate_description(
                new_text, self.anatomy_expression_cluster_description)
        elif module == Module.EXPRESSION_CLUSTER_MOLECULE:
            self.molecule_expression_cluster_description = self._concatenate_description(
                new_text, self.molecule_expression_cluster_description)
        elif module == Module.DO_EXPERIMENTAL:
            self.do_experimental_description = self._concatenate_description(
                new_text, self.do_experimental_description)
        elif module == Module.DO_BIOMARKER:
            self.do_biomarker_description = self._concatenate_description(new_text, self.do_biomarker_description)
        elif module == Module.DO_ORTHOLOGY:
            self.do_orthology_description = self._concatenate_description(new_text, self.do_orthology_description)
        elif module == Module.SISTER_SP:
            self.sister_species_description = self._concatenate_description(new_text, self.sister_species_description)
        elif module == Module.ORTHOLOGY:
            self.orthology_description = self._concatenate_description(new_text, self.orthology_description)
        elif module == Module.INFO_POOR_HUMAN_FUNCTION:
            self.human_gene_function_description = self._concatenate_description(
                new_text, self.human_gene_function_description)
        elif module == Module.PROTEIN_DOMAIN:
            self.protein_domain_description = self._concatenate_description(new_text, self.protein_domain_description)

        # Multimodule fields
        go_modules = (Module.GO_PROCESS, Module.GO_FUNCTION, Module.GO_COMPONENT)
        do_modules = (Module.DO_EXPERIMENTAL, Module.DO_BIOMARKER, Module.DO_ORTHOLOGY)
        if module in go_modules:
            self.go_description = self._merge_descriptions(
                [self.go_function_description, self.go_process_description, self.go_component_description])
        if module in do_modules:
            # the three DO modules feed the same set of final DO ids
            all_ids = module_sentences.get_ids(experimental_only=False)
            self.stats.set_final_do_ids = self._get_merged_ids(all_ids, self.stats.set_final_do_ids)
            self.do_description = self._merge_descriptions(
                [self.do_experimental_description, self.do_biomarker_description, self.do_orthology_description])
            self.stats.number_final_do_term_covering_multiple_initial_do_terms = self.do_description.count(
                "(multiple)")
Пример #6
0
def generate_ortholog_sentence_wormbase_non_c_elegans(orthologs: List[List[str]], orthologs_sp_fullname: str,
                                                      api_manager: APIManager, config: GenedescConfigParser):
    """build orthology sentence for WormBase orthologs belonging to a single species

    With up to three orthologs, all their symbols are listed in alphabetical order. With more than three, the
    orthologs are ranked by textpresso popularity and grouped by gene class: a class with at least two orthologs
    (when more than one class is present) is represented by its most popular member, every other ortholog stands
    on its own, and the three most popular entries are mentioned.

    Args:
        orthologs (List[List[str]]): list of orthologs, containing gene_id, gene_symbol
        orthologs_sp_fullname (str): full name of the species the orthologs belong to
        api_manager (APIManager): api manager to send requests to wormbase and textpresso
        config (GenedescConfigParser): a gene description configuration object
    Returns:
        str: the orthology sentence, or None if there are no orthologs
    """
    if not orthologs:
        return None

    # abbreviate the first word of the species name to its initial when it is longer than two characters
    genus, *other_words = orthologs_sp_fullname.split(" ")
    if len(genus) > 2:
        orthologs_sp_fullname = " ".join([genus[0] + "."] + other_words)

    if len(orthologs) <= 3:
        # sort orthologs alphabetically
        symbols = sorted(orth[1] for orth in orthologs)
        return "is an ortholog of " + orthologs_sp_fullname + " " + concatenate_words_with_oxford_comma(
            symbols, separator=config.get_terms_delimiter())

    # sort orthologs by tpc popularity and by symbol (if tied), highest first
    ranked = sorted(([orth, api_manager.get_textpresso_popularity(orth[1])] for orth in orthologs),
                    key=lambda orth_pop: (orth_pop[1], orth_pop[0][1]), reverse=True)
    members_by_class = defaultdict(list)
    standalone = []
    for orth_pop in ranked:
        gene_class = api_manager.get_gene_class(orth_pop[0][0])
        if gene_class:
            members_by_class[gene_class].append(orth_pop)
        else:
            standalone.append(orth_pop)
    if len(members_by_class) == 1:
        # a single gene class is not worth mentioning: all of its members stand on their own
        (only_members,) = members_by_class.values()
        standalone.extend(only_members)
        members_by_class = {}
    else:
        for members in members_by_class.values():
            if len(members) == 1:
                standalone.extend(members)
    # each remaining class is represented by its first (most popular) member
    best_of_class = {gene_class: members[0] for gene_class, members in members_by_class.items()
                     if len(members) > 1}

    # entries are (ortholog with popularity, gene class), the class being None for standalone orthologs
    entries = [(orth_pop, None) for orth_pop in standalone]
    entries.extend((orth_pop, gene_class) for gene_class, orth_pop in best_of_class.items())
    entries.sort(key=lambda entry: entry[0][1], reverse=True)
    top_entries = entries[0:3]

    standalone_symbols = [orth_pop[0][1] for orth_pop, gene_class in top_entries if gene_class is None]
    class_names = [gene_class for _, gene_class in top_entries if gene_class is not None]
    class_member_symbols = [orth_pop[0][1] for orth_pop, gene_class in top_entries if gene_class is not None]

    sentence_parts = []
    if standalone_symbols:
        sentence_parts.append(orthologs_sp_fullname + " " + concatenate_words_with_oxford_comma(
            standalone_symbols, separator=config.get_terms_delimiter()))
    if class_names:
        members_text = concatenate_words_with_oxford_comma(
            class_member_symbols, separator=config.get_terms_delimiter())
        classes_text = concatenate_words_with_oxford_comma(class_names, separator=config.get_terms_delimiter())
        classes_word = "classes" if len(class_names) > 1 else "class"
        sentence_parts.append("members of the " + orthologs_sp_fullname + " " + classes_text +
                              " gene " + classes_word + " including " + members_text)
    return "is an ortholog of " + " and ".join(sentence_parts)