예제 #1
0
def fetch_subspace(prob: int = 70, rmsd: float = 3.0,
                   ca_min: int = 10, ca_max: int = 200, score_tm_pair: float = 0.3, ratio: float = 1.25,
                   diff_folds: bool = True) -> Result:
    """Fetch every hit in hh207clusters that passes the given thresholds.

    :param prob: Lower (exclusive) cutoff for the ``prob`` column
    :param rmsd: Upper (exclusive) cutoff for ``rmsd_tm_pair``
    :param ca_min: Lower (exclusive) cutoff for ``ca_tm_pair``
    :param ca_max: Upper (exclusive) cutoff for ``ca_tm_pair``
    :param score_tm_pair: Lower (exclusive) cutoff for ``score_tm_pair``
    :param ratio: Upper (exclusive) cutoff for ``cols / ca_tm_pair``
    :param diff_folds: Unless this is exactly ``False``, only rows with
        ``q_fold_id != s_fold_id`` are kept
    :return: A Result object wrapping the parsed hits
    """
    # Only the literal False disables the fold filter (identity check).
    fold_filter = None if diff_folds is False else 'q_fold_id != s_fold_id'

    query = (
        f"select * from hh207clusters where prob > {prob}"
        f" and rmsd_tm_pair < {rmsd}"
        f" and ca_tm_pair > {ca_min} and ca_tm_pair < {ca_max}"
        f" and score_tm_pair > {score_tm_pair} "
        f"and cast(cols as float)/ca_tm_pair < {ratio} "
    )
    if fold_filter is not None:
        query += f"and {fold_filter} "

    # NOTE(review): `cond`, `cond1` and `cond2` are not parameters or locals of
    # this function. Unless they exist at module level, the lookups below raise
    # NameError -- TODO confirm where they are meant to come from. Also note
    # that the `or {cond2}` clause is appended without parentheses.
    if cond is not None:
        query += f"and {cond} "
    if cond1 is not None:
        query += f"and {cond1} "
    if cond2 is not None:
        query += f"or {cond2} "

    cur.execute(query)
    return Result([parse_hit(row) for row in cur.fetchall()])
예제 #2
0
def fetch_group(group1, group2=None, prob: int = 70, rmsd: float = 3.0,
                ca_min: int = 10, ca_max: int = 200, score_tm_pair: float = 0.3, ratio: float = 1.25,
                diff_folds: bool = True) -> Result:
    """ Fetching all hits between two specific groups (folds, superfamilies and families)
    or inside one specific group (group1)

    :param group1: The first group from where to search. E.g 'c.2'
    :param group2 (optional): The second group from where to search. E.g 'c.2'
    :param prob: the minimum allowed HHsearch probability
    :param rmsd: The maximum allowed RMSD (rmsd_tm_pair: RMSD for the TMalign alignment between the two domains, passing the sequence alignment as seed)
    :param ca_min: The minimum allowed fragment length (for the TMalign alignment)
    :param ca_max: The maximum allowed fragment length (for the TMalign alignment)
    :param score_tm_pair: The minimum allowed TM-score (for the TMalign alignment)
    :param ratio: the maximum ratio for the sequence and structural alignment lengths (cols / ca_tm_pair)
    :param diff_folds: Whether to exclude hits from the same fold (True) or not (False)
    :return: A Result class with the hits that fulfill these criteria
    :raises ValueError: If the two groups have a different number of dot-separated
        fields, or if that number is not 2, 3 or 4
    """
    level1 = len(group1.split('.'))
    if group2:
        level2 = len(group2.split('.'))
        if level1 != level2:
            raise ValueError("The two groups must belong to the same SCOP level")

    # Number of dot-separated fields -> column suffix shared by the
    # query-side (q_) and subject-side (s_) columns.
    level_columns = {2: "fold_id", 3: "sufam_id", 4: "scop_id"}  # folds, superfamilies, families
    suffix = level_columns.get(level1)
    if suffix is None:
        raise ValueError("The specified SCOP level does not exist")
    q_col = f"q_{suffix}"
    s_col = f"s_{suffix}"

    ex = f"select * from hh207clusters where prob > {prob}" \
         f" and rmsd_tm_pair < {rmsd}" \
         f" and ca_tm_pair > {ca_min} and ca_tm_pair < {ca_max}" \
         f" and score_tm_pair > {score_tm_pair}" \
         f" and cast(cols as float)/ca_tm_pair < {ratio}"

    # The group identifiers are bound as parameters instead of being pasted
    # into the SQL text, so quotes in the input cannot alter the statement.
    # Only the column names (taken from the fixed table above) are interpolated.
    if group2:
        # Match the pair in either orientation (query/subject swapped).
        ex += f" and (({q_col} = ? and {s_col} = ?)" \
              f" or ({s_col} = ? and {q_col} = ?))"
        params = (group1, group2, group1, group2)
    else:
        # A single group: it may appear on either side of the hit.
        ex += f" and ({q_col} = ? or {s_col} = ?)"
        params = (group1, group1)

    # Only the literal False disables the different-fold filter.
    if diff_folds is not False:
        ex += " and q_fold_id != s_fold_id"

    cur.execute(ex, params)
    return Result([parse_hit(line) for line in cur.fetchall()])
예제 #3
0
def fetch_id(fuzzle_id: int) -> Hit:
    """
    Returns the hit in fuzzle with that ID
    :param fuzzle_id: The Fuzzle HIT id to retrieve from hh207clusters
    :return: A Hit object
    :raises IndexError: If no row in hh207clusters has that id
    """

    cur.execute("select * from hh207clusters where id = ?", (fuzzle_id,))

    rows = cur.fetchall()
    if not rows:
        # Same exception type the bare rows[0] lookup would raise, but with a
        # message that names the missing id.
        raise IndexError(f"No hit with id {fuzzle_id} found in hh207clusters")
    return parse_hit(list(rows[0]))
예제 #4
0
def _domain_from_PDB(pdb: str):
    """ Returns all domains included in a SCOP version that come from a PDB.

    First looks up the ``sdomain`` values whose ``pdbref`` matches the PDB code in
    ``scop_pdbref_scop``, then maps them to the ``ref`` values of
    ``astral_astral95`` whose ``id`` is among those domains.

    :param pdb: A 4-letter pdb code
    :return: A numpy array with the unique ``ref`` values found for this PDB
    :raises NotCorrectPDBError: If no ``sdomain`` entry matches the PDB code
    """
    pdb = pdb.lower()
    # Values are bound as parameters: no quoting problems with the input.
    cur.execute("SELECT sdomain FROM scop_pdbref_scop WHERE pdbref like ?", (pdb,))
    # dict.fromkeys drops duplicates while keeping plain (non-numpy) values.
    scop_domains = list(dict.fromkeys(row[0] for row in cur.fetchall()))
    if not scop_domains:
        from protlego.builder.builder import NotCorrectPDBError
        raise NotCorrectPDBError(f"PDB code {pdb.capitalize()} does not yet appear in the SCOP database")

    # One placeholder per domain. Formatting a tuple into the SQL text needs a
    # special case for a single element and yields "np.str_('...')" items with
    # NumPy >= 2, which is not valid SQL.
    placeholders = ", ".join("?" * len(scop_domains))
    cur.execute(f"SELECT ref FROM astral_astral95 WHERE id in ({placeholders})", scop_domains)
    domains = np.unique([row[0] for row in cur.fetchall()])
    logger.info(f"Domain(s) {domains} are present in the Fuzzle database"
                f" as representatives for pdb {pdb.capitalize()}")

    return domains
예제 #5
0
def fetch_by_domains(domain1: str, domain2: str, prob: int = 70, rmsd: float = 3.0,
                     ca_min: int = 10, ca_max: int = 200, score_tm_pair: float = 0.3, ratio: float = 1.25,
                     diff_folds: bool = True):
    """ Fetch all the hits between two parent domains

    :param domain1: The 7 letter code for one of the parents
    :param domain2: The 7 letter code for one of the parents
    :param prob: the minimum allowed HHsearch probability
    :param rmsd: The maximum allowed RMSD (rmsd_tm_pair: RMSD for the TMalign alignment between the two domains, passing the sequence alignment as seed). Rows whose rmsd_tm_pair is NULL are also accepted
    :param ca_min: The minimum allowed fragment length (for the TMalign alignment)
    :param ca_max: The maximum allowed fragment length (for the TMalign alignment)
    :param score_tm_pair: The minimum allowed TM-score (for the TMalign alignment)
    :param ratio: the maximum ratio for the sequence and structural alignment lengths (cols / ca_tm_pair)
    :param diff_folds: Whether to exclude hits from the same fold (True) or not (False)
    :return: The parsed hit itself when exactly one row matches; otherwise a
        Result wrapping all parsed hits (possibly none)
    """
    # The domain codes are bound as parameters rather than pasted into the
    # SQL text, so quotes in the input cannot alter the statement.
    ex = f"select * from hh207clusters where prob > {prob}" \
         f" and (rmsd_tm_pair < {rmsd} or rmsd_tm_pair is NULL)" \
         f" and ca_tm_pair > {ca_min} and ca_tm_pair < {ca_max}" \
         f" and score_tm_pair > {score_tm_pair}" \
         f" and cast(cols as float)/ca_tm_pair < {ratio}" \
         f" and ((query = ? and sbjct = ?)" \
         f" or (sbjct = ? and query = ?))" \
         f" and query != sbjct"
    # Only the literal False disables the different-fold filter.
    if diff_folds is not False:
        ex += " and q_fold_id != s_fold_id"

    # Parameter order follows the placeholders: the pair in both orientations.
    cur.execute(ex, (domain1, domain2, domain1, domain2))
    rows = cur.fetchall()
    if len(rows) == 1:
        return parse_hit(rows[0])
    return Result([parse_hit(line) for line in rows])
예제 #6
0
def fetch_byPDBs(pdb1: str, pdb2: str, prob: int = 70, rmsd: float = 3.0,
                 ca_min: int = 10, ca_max: int = 200, score_tm_pair: float = 0.3, ratio: float = 1.25,
                 diff_folds: bool = True):
    """ Includes all hits among the domains that belong to a pair of PDBs

    Both the query and the subject of a hit must be among the domains found for
    either PDB.

    :param pdb1: The first PDB to check
    :param pdb2: The second PDB to check
    :param prob: the minimum allowed HHsearch probability
    :param rmsd: The maximum allowed RMSD (rmsd_tm_pair: RMSD for the TMalign alignment between the two domains, passing the sequence alignment as seed)
    :param ca_min: The minimum allowed fragment length (for the TMalign alignment)
    :param ca_max: The maximum allowed fragment length (for the TMalign alignment)
    :param score_tm_pair: The minimum allowed TM-score (for the TMalign alignment)
    :param ratio: the maximum ratio for the sequence and structural alignment lengths (cols / ca_tm_pair)
    :param diff_folds: Whether to exclude hits from the same fold (True) or not (False)
    :return: A result class obtaining the hits that fulfill these criteria
    """
    # Plain str values, de-duplicated, so they can be bound as SQL parameters.
    domains = list(dict.fromkeys(
        str(d) for pdb in (pdb1, pdb2) for d in _domain_from_PDB(pdb)
    ))

    # One placeholder per domain. Formatting a tuple into the SQL text breaks
    # for a one-element tuple (trailing comma) and for NumPy >= 2 string
    # scalars, whose repr is "np.str_('...')".
    placeholders = ", ".join("?" * len(domains))
    ex = f"select * from hh207clusters where prob > {prob}" \
         f" and rmsd_tm_pair < {rmsd}" \
         f" and ca_tm_pair > {ca_min} and ca_tm_pair < {ca_max}" \
         f" and score_tm_pair > {score_tm_pair}" \
         f" and cast(cols as float)/ca_tm_pair < {ratio}" \
         f" and query in ({placeholders})" \
         f" and sbjct in ({placeholders})"
    # Only the literal False disables the different-fold filter.
    if diff_folds is not False:
        ex += " and q_fold_id != s_fold_id"

    # The domain list is supplied twice: once per IN (...) clause.
    cur.execute(ex, domains + domains)
    return Result([parse_hit(line) for line in cur.fetchall()])