Пример #1
0
def synonym_name_details(session: Session, synonym_name: str) -> Tuple:
    r"""Work out the organism and the text renderings for a synonym name.

    A name shaped like ``<abbr>\<rest>``, optionally preceded by one capital
    letter and a colon (for example ``T:``), is treated as organism specific:
    ``<abbr>`` is looked up with ``get_organism``. When the name does not have
    that shape, or ``get_organism`` raises ``CodingError`` for the
    abbreviation, the default organism is used and the name is processed
    exactly as given.

    Args:
        session (sqlalchemy.orm.session.Session object): db connection to use.

        synonym_name (str): synonym name to be processed.

    Returns:
        A 3-tuple of:

        organism for the entry,

        ``sgml_to_plain_text`` applied to the name,

        ``sgml_to_unicode(sub_sup_to_sgml(...))`` applied to the name.
    """
    pattern = r"""
        ^([A-Z]:){0,1}   # May have T: or not {0 or 1} Not sure of variety so any captial letter is fine
        ([^\\\s]+)       # possible species abbreviation, Non space chars and not a '\'
        \\               # forward slash
        (.*)             # anything else
    """
    match = re.search(pattern, synonym_name, re.VERBOSE)

    if match is not None:
        prefix, abbr, remainder = match.groups()
        try:
            organism = get_organism(session, short=abbr)
        except CodingError:
            # Not a known species abbreviation: use the default handling below.
            pass
        else:
            # Re-assemble the name from the captured pieces (prefix is
            # None when the optional "X:" part was absent).
            name = "{}{}\\{}".format(prefix or '', abbr, remainder)
            plain_text = sgml_to_plain_text(name)
            return organism, plain_text, sgml_to_unicode(sub_sup_to_sgml(name))

    default_organism = get_default_organism(session)
    plain_text = sgml_to_plain_text(synonym_name)
    return default_organism, plain_text, sgml_to_unicode(
        sub_sup_to_sgml(synonym_name))
Пример #2
0
    def get_agr_title(self):
        """Convert chars and sub/superscript tags in FlyBase title for AGR export.

        Chooses the text to export, converts it and stores the result in
        ``self.agr_title``:

        * title and volumetitle both present: the two joined by a space;
        * only title present: the title;
        * no title, pub_type is 'compendium' and a miniref exists: the miniref;
        * otherwise: a warning is appended to ``self.processing_warnings``
          and the placeholder 'No title available.' is used.

        If ``sgml_to_unicode`` raises ``KeyError`` a warning is recorded and
        the text is kept as returned by ``sub_sup_sgml_to_html``.
        """
        # First determine if there's any title and/or volumetitle.
        if self.title is not None and self.volumetitle is not None:
            title_to_use = self.title + ' ' + self.volumetitle
        elif self.title is not None:
            title_to_use = self.title
        elif self.pub_type == 'compendium' and self.miniref is not None:
            # Special exception for compendia: use pub.miniref if pub.title is null.
            title_to_use = self.miniref
        else:
            # Also reached for a compendium without a miniref, so that None
            # is never handed to the converters below.
            self.processing_warnings.append('No pub.title available.')
            title_to_use = 'No title available.'

        # Once the title to use has been chosen, convert problematic characters.
        title_to_use = sub_sup_sgml_to_html(title_to_use)
        try:
            title_to_use = sgml_to_unicode(title_to_use)
        except KeyError:
            self.processing_warnings.append(
                'Atypical sgml character(s) in title.')
        self.agr_title = title_to_use
Пример #3
0
    def get_agr_citation(self):
        """Convert chars and sub/superscript tags in FlyBase miniref for AGR export.

        Stores the result in ``self.agr_citation``. When ``self.miniref`` is
        None a warning is appended to ``self.processing_warnings`` and the
        placeholder 'No citation available.' is stored instead.

        If ``sgml_to_unicode`` raises ``KeyError`` (handled the same way as in
        the title and abstract converters) a warning is recorded and the
        miniref is passed to ``sub_sup_sgml_to_html`` without the unicode
        conversion.
        """
        if self.miniref is None:
            self.processing_warnings.append('No pub.miniref available.')
            self.agr_citation = 'No citation available.'
            return

        try:
            converted_miniref = sgml_to_unicode(self.miniref)
        except KeyError:
            self.processing_warnings.append(
                'Atypical sgml character(s) in miniref.')
            converted_miniref = self.miniref
        self.agr_citation = sub_sup_sgml_to_html(converted_miniref)
Пример #4
0
    def get_agr_abstract(self):
        """Convert chars and sub/superscript tags in FlyBase abstract for AGR export.

        Does nothing when ``self.pubmed_abstract`` is empty or None.
        Otherwise the first abstract is converted and stored in
        ``self.agr_abstract``; when there is more than one abstract a warning
        stating how many were found is appended to
        ``self.processing_warnings``.

        If ``sgml_to_unicode`` raises ``KeyError`` a warning is recorded and
        the text is kept as returned by ``sub_sup_sgml_to_html``.
        """
        if not self.pubmed_abstract:
            return

        abstract_count = len(self.pubmed_abstract)
        if abstract_count > 1:
            self.processing_warnings.append(
                'Pub has many abstracts ({}). Using the first one.'.format(
                    abstract_count))

        abstract_to_use = sub_sup_sgml_to_html(self.pubmed_abstract[0])
        try:
            abstract_to_use = sgml_to_unicode(abstract_to_use)
        except KeyError:
            self.processing_warnings.append(
                'Atypical sgml character(s) in abstract.')
        self.agr_abstract = abstract_to_use
Пример #5
0
    def get_agr_title(self):
        """Convert chars and sub/superscript tags in FlyBase title for AGR export.

        The text used is the title plus the volumetitle (space separated)
        when both exist, or the title alone. Without a title a warning is
        appended to ``self.processing_warnings`` and the placeholder
        'No title available.' is used. The converted text is stored in
        ``self.agr_title``.
        """
        if self.title is not None and self.volumetitle is not None:
            raw_title = self.title + ' ' + self.volumetitle
        elif self.title is not None:
            raw_title = self.title
        else:
            self.processing_warnings.append('No pub.title available.')
            raw_title = 'No title available.'

        html_title = sub_sup_sgml_to_html(raw_title)
        try:
            final_title = sgml_to_unicode(html_title)
        except KeyError:
            # Odd characters make "sgml_to_unicode" raise; keep the
            # html-converted text and note the problem.
            self.processing_warnings.append(
                'Atypical sgml character(s) in title.')
            final_title = html_title
        self.agr_title = final_title
Пример #6
0
 def make_description(self):
     """Concatenate "nature_lesion" strings into a description.

     Reads ``self.molecular_info``, ``self.aminoacid_rep`` and
     ``self.nucleotide_sub`` (each expected to be a list of strings) and
     stores the result in ``self.description``.

     If any of the three is None a warning is logged for the first missing
     one and the description is set to None. If all three are present but
     empty, a debug message is logged and the description is set to None.
     Otherwise the strings are joined with spaces, '@' characters are
     removed and the sub/superscript and Greek conversions are applied.
     """
     nature_lesion_list = []
     for attr_name in ('molecular_info', 'aminoacid_rep', 'nucleotide_sub'):
         attr_value = getattr(self, attr_name)
         if attr_value is None:
             log.warning(
                 'Allele {} missing "{}" info for description.'.format(
                     self.uniquename, attr_name))
             self.description = None
             # Stop here: there is no list to build the description from.
             return
         nature_lesion_list.extend(attr_value)

     if not nature_lesion_list:
         log.debug(
             'Allele {} has no nature_lesion to report for description'.
             format(self.uniquename))
         self.description = None
         return

     nature_lesion = ' '.join(nature_lesion_list).replace('@', '')
     # Convert brackets into FB sub/superscript.
     nature_lesion = sub_sup_to_sgml(nature_lesion)
     # Convert FB sub/superscript to html.
     nature_lesion = sub_sup_sgml_to_html(nature_lesion)
     # Convert FB "&.gr;" Greeks to unicode.
     nature_lesion = sgml_to_unicode(nature_lesion)
     self.description = nature_lesion
Пример #7
0
def feature_symbol_lookup(session: Session,
                          type_name: str,
                          synonym_name: str,
                          organism_id: Optional[int] = None,
                          cv_name: str = 'synonym type',
                          cvterm_name: str = 'symbol',
                          check_unique: bool = True,
                          obsolete: str = 'f',
                          convert: bool = True) -> Feature:
    """Look up the feature that has a specific type and current synonym.

    Args:
        session (sqlalchemy.orm.session.Session object): db connection to use.

        type_name (str): <can be None> cvterm name, defining the type of feature.

        synonym_name (str): symbol to look up.

        organism_id (int): <optional> chado organism_id. When not given, the
                           organism is derived from the synonym name via
                           synonym_name_details.

        cv_name (str): <optional> cv name, defaults to 'synonym type'

        cvterm_name (str): <optional> cvterm name, defaults to 'symbol'

        check_unique (bool): <optional> Set to False to fetch more than one
                             feature with that symbol.

        obsolete ('t', 'f', 'e'): <optional> is feature obsolete
                                  t = true
                                  f = false (default)
                                  e = either, not fussed.

        convert (bool): <optional> defaults to True.
                        Whether to look up the converted form of the name
                        (sub/superscript brackets and sgml entities
                        converted) rather than the name exactly as given.

    ONLY replace cvterm_name and cv_name if you know exactly what you are doing.
    Symbol lookups are kind of special and initialized here for ease of use.

    Returns:
        Feature object, or a list of Feature objects if check_unique is
        passed as False. A hit in feature_cache is returned as stored.

    Raises:
        NoResultFound: If no feature found matching the synonym
                       (check_unique True).

        MultipleResultsFound: If more than one feature found matching the
                              synonym (check_unique True).
    """
    if organism_id:
        # Convert name to sgml format for lookup.
        lookup_name = sgml_to_unicode(sub_sup_to_sgml(synonym_name))
    else:
        # No organism given: let the name decide (defaults to Dros).
        organism, _plain_name, lookup_name = synonym_name_details(
            session, synonym_name)
        organism_id = organism.organism_id
    if not convert:
        lookup_name = synonym_name

    # Check cache
    if type_name in feature_cache and lookup_name in feature_cache[type_name]:
        return feature_cache[type_name][lookup_name]

    synonym_type = get_cvterm(session, cv_name, cvterm_name)
    check_obs = _check_obsolete(obsolete)

    conditions = [
        Synonym.type_id == synonym_type.cvterm_id,
        Synonym.synonym_sgml == lookup_name,
        Feature.organism_id == organism_id,
        FeatureSynonym.is_current == 't',
    ]
    if check_obs:
        conditions.append(Feature.is_obsolete == obsolete)
    if not type_name or type_name == 'gene':
        # Leave out features whose uniquename contains 'FBog'.
        conditions.append(~Feature.uniquename.contains('FBog'))
    if type_name:
        feature_type = feature_type_lookup(session, type_name)
        conditions.append(Feature.type_id == feature_type.cvterm_id)

    query = session.query(Feature).join(FeatureSynonym).join(Synonym).\
        filter(*conditions)

    if not check_unique:
        return query.all()

    feature = query.one()
    add_to_cache(feature, lookup_name)
    return feature
Пример #8
0
def feature_synonym_lookup(session: Session,
                           type_name: str,
                           synonym_name: str,
                           organism_id: Optional[int] = None,
                           cv_name: str = 'synonym type',
                           cvterm_name: str = 'symbol',
                           check_unique: bool = False,
                           obsolete: str = 'f'):
    """Get feature from the synonym.

    Lookup to see if the synonym has been used before. Even if not current.
    Check for uniqueness if requested.

    Args:
        session (sqlalchemy.orm.session.Session object): db connection to use.

        type_name (str): cvterm name, defining the type of feature.

        synonym_name (str): symbol to look up.

        organism_id (int): <optional> chado organism_id. Defaults to the
                           default organism.

        cv_name (str): <optional> cv name, defaults to 'synonym type'

        cvterm_name (str): <optional> cvterm name, defaults to 'symbol'

        check_unique (bool): <optional> defaults to False. When True, all
                             matches must share one uniquename and a single
                             Feature is returned.

        obsolete ('t', 'f', 'e'): <optional> is feature obsolete
                                  t = true
                                  f = false (default)
                                  e = either, not fussed.

    Returns:
        With check_unique False: list of Feature objects (empty when nothing
        matches). With check_unique True: a single Feature.
        A hit in feature_cache is returned as stored.

    Raises:
        DataError: With check_unique True, if no feature matches or the
                   matches do not all share the same uniquename.
    """
    check_obs = _check_obsolete(obsolete)

    # Default to Dros if no organism specified.
    if not organism_id:
        organism_id = get_default_organism_id(session)

    # convert name to sgml format for lookup
    synonym_sgml = sgml_to_unicode(sub_sup_to_sgml(synonym_name))

    # check cache
    if type_name in feature_cache and synonym_sgml in feature_cache[type_name]:
        return feature_cache[type_name][synonym_sgml]

    # get feature type expected from type_name
    feature_type = feature_type_lookup(session, type_name)
    synonym_type = get_cvterm(session, cv_name, cvterm_name)

    filter_spec: Any = (
        Synonym.type_id == synonym_type.cvterm_id,
        Synonym.synonym_sgml == synonym_sgml,
        Feature.organism_id == organism_id,
        Feature.type_id == feature_type.cvterm_id,
    )

    if check_obs:
        filter_spec += (Feature.is_obsolete == obsolete, )

    # Query.all() returns an empty list when nothing matches (it does not
    # raise NoResultFound), so the "nothing found" case is handled below.
    features = session.query(Feature).join(FeatureSynonym).join(Synonym).\
        filter(*filter_spec).all()

    if not check_unique:
        return features

    # A feature/synonym link also carries a pub, so the same feature can come
    # back several times; only distinct uniquenames count as ambiguity.
    if len({feat.uniquename for feat in features}) > 1:
        raise DataError(
            "DataError: Could not find UNIQUE current synonym '{}', sgml = '{}' for type '{}'."
            .format(synonym_name, synonym_sgml, cvterm_name))

    if not features:
        raise DataError(
            "DataError: Could not find current unique synonym '{}', sgml = '{}' for type '{}'."
            .format(synonym_name, synonym_sgml, cvterm_name))

    feature = features[-1]
    add_to_cache(feature)
    return feature
Пример #9
0
def test_sgml_to_unicode(key):
    """Check that ``sgml_to_unicode`` leaves the ``dict_to_test`` entry for *key* unchanged."""
    # NOTE(review): the same dict entry is used as both input and expected
    # value, so this only checks the entry passes through unchanged. Confirm
    # that is intended rather than sgml_to_unicode(key) == dict_to_test[key].
    expected = dict_to_test[key]
    assert sgml_to_unicode(expected) == expected
Пример #10
0
def general_symbol_lookup(session: Session,
                          sql_object_type: GeneralObjects,
                          syn_object_type,
                          type_name: str,
                          synonym_name: str,
                          organism_id: int = None,
                          cv_name: str = 'synonym type',
                          cvterm_name: str = 'symbol',
                          check_unique: bool = True,
                          obsolete: str = 'f',
                          convert: bool = True):
    """Look up an "other" (non-feature) object by type and synonym name.

    Args:
        session: db connection to use.

        sql_object_type (sqlalchemy object type): i.e. Grp, CellLine, Strain

        syn_object_type: class joined between sql_object_type and Synonym
                         in the query.

        type_name (str): <can be None> cvterm name, defining the type of object.

        synonym_name (str): symbol to look up.

        organism_id (int): <optional> chado organism_id. Only filtered on
                           when given; it is not derived from the name.

        cv_name (str): <optional> cv name, defaults to 'synonym type'

        cvterm_name (str): <optional> cvterm name, defaults to 'symbol'

        check_unique (bool): <optional> Set to False to fetch more than one
                             object with that symbol.

        obsolete ('t', 'f', 'e'): <optional> is object obsolete
                                  t = true
                                  f = false (default)
                                  e = either, not fussed.

        convert (bool): <optional> defaults to True.
                        Whether to look up the converted form of the name
                        (sub/superscript brackets and sgml entities
                        converted) rather than the name exactly as given.

    ONLY replace cvterm_name and cv_name if you know exactly what you are doing.
    Symbol lookups are kind of special and initialized here for ease of use.

    Returns:
        A single object, or a list of objects if check_unique is passed as
        False. A hit in general_cache is returned as stored; this function
        itself does not add to the cache.

    Raises:
        NoResultFound: If no object found matching the synonym
                       (check_unique True).

        MultipleResultsFound: If more than one object found matching the
                              synonym (check_unique True).
    """
    # Convert name to sgml format for lookup.
    lookup_name = sgml_to_unicode(sub_sup_to_sgml(synonym_name))
    if not convert:
        lookup_name = synonym_name

    # Check cache
    if type_name in general_cache and lookup_name in general_cache[type_name]:
        return general_cache[type_name][lookup_name]

    synonym_type = get_cvterm(session, cv_name, cvterm_name)
    check_obs = _check_obsolete(obsolete)

    conditions = [Synonym.synonym_sgml == lookup_name]

    if type_name:
        conditions.append(Synonym.type_id == synonym_type.cvterm_id)

    # Note: type error messages suppressed here as the args should deal with
    #       inconsistencies.
    if organism_id:
        conditions.append(
            sql_object_type.organism_id == organism_id)  # type: ignore

    if check_obs:
        conditions.append(
            sql_object_type.is_obsolete == obsolete)  # type: ignore

    if type_name:
        feature_type = general_type_lookup(session, type_name)
        conditions.append(
            sql_object_type.type_id == feature_type.cvterm_id)  # type: ignore

    query = session.query(sql_object_type).join(syn_object_type).join(Synonym).\
        filter(*conditions)

    if check_unique:
        result = query.one()
    else:
        result = query.all()

    return result