示例#1
0
 def test_extract_by_reference_positions(self):
     """ Checks position-based extraction for a gapped and an ungapped alignment.

         Both cases request reference positions 0, 1, 3 and 4. The gapped
         case expects "ACE-", which shows the positions being counted along
         the reference with its gaps skipped, and a gap in the query at a
         requested position being carried through into the result.
     """
     cases = [
         # (aligned query, aligned reference, expected extraction)
         ("ABC-DE-F", "A-BC-DEF", "ACE-"),
         ("ABCDF", "ABCDE", "ABDF"),
     ]
     for query, reference, expected in cases:
         # the position list is built fresh for every call
         extracted = utils.extract_by_reference_positions(query, reference,
                                                          [0, 1, 3, 4])
         assert extracted == expected
示例#2
0
def run_kr_analysis(
    queries: Dict[str, str]
) -> Tuple[Dict[str, Prediction], Dict[str, Prediction]]:
    """ Derives activity and stereochemistry predictions for KR domains.

        Each query is aligned on its own against the KR reference domains and
        two signatures are extracted from the aligned query, using positions
        relative to a fixed reference sequence: one signature for the active
        site and one for stereochemistry.

        Arguments:
            queries: a mapping of query CDS name to sequence

        Returns:
            a pair of dicts, the first mapping every query name to
                a "kr_activity" prediction of "active" or "inactive",
                the second mapping query name to a "kr_stereochem" prediction,
                with entries only for those queries where
                predict_stereochemistry() gave a non-empty result
    """
    reference_name = "MAPSI|PKS|CAM00062.1|Erythromycin_synthase_modules_1_and_2|Sacc_KR1"

    # one (name, active site signature, stereochemistry signature) entry
    # per query, with queries handled in order of name
    signatures = []
    for name in sorted(queries):
        alignment = subprocessing.run_muscle_single(name, queries[name],
                                                    _KR_DOMAINS_FILENAME)
        aligned_reference = alignment[reference_name]
        aligned_query = alignment[name]

        active_site = utils.extract_by_reference_positions(
            aligned_query, aligned_reference, [110, 134, 147, 151])
        stereochem = utils.extract_by_reference_positions(
            aligned_query, aligned_reference,
            [90, 91, 92, 139, 144, 147, 149, 151])
        signatures.append((name, active_site, stereochem))

    # every query gets an activity prediction
    activity = {}  # type: Dict[str, Prediction]
    for name, active_site, _ in signatures:
        state = "active" if is_active(active_site) else "inactive"
        activity[name] = SimplePrediction("kr_activity", state)

    # queries without a stereochemistry prediction are left out entirely
    stereochemistry = {}  # type: Dict[str, Prediction]
    for name, _, stereochem in signatures:
        chem = predict_stereochemistry(stereochem)
        if not chem:
            continue
        stereochemistry[name] = SimplePrediction("kr_stereochem", chem)

    return activity, stereochemistry
示例#3
0
def extract_cterminus(data_dir: str, cds_features: List[CDSFeature],
                      end_cds: Optional[CDSFeature]) -> Dict[str, str]:
    """ Extracts residues from the C-terminal end of each non-ending CDS.

        The last 100 residues (or fewer, for shorter translations) of each
        CDS are aligned against the references in 'cterm.fasta', then the
        residues at positions 55 and 64 of the "EryAII_ref" reference are
        extracted from the aligned query.

        Arguments:
            data_dir: the directory containing the C-terminal reference files
            cds_features: the list of CDSFeatures to extract terminals from
            end_cds: if not None, this CDS is skipped (matched by identity)

        Returns:
            a dictionary mapping CDS name to the result of the extraction
    """
    reference_file = os.path.join(data_dir, 'cterm.fasta')

    # collect all the tails before any alignment is run
    tails = {}  # type: Dict[str, str]
    for cds in cds_features:
        if cds is end_cds:
            continue
        translation = str(cds.translation)
        tails[cds.get_name()] = translation[-100:]

    residues = {}
    for name, tail in tails.items():
        aligned = subprocessing.run_muscle_single(name, tail, reference_file)
        residues[name] = utils.extract_by_reference_positions(
            aligned[name], aligned["EryAII_ref"], [55, 64])
    return residues
示例#4
0
def run_kr_analysis(
        queries: Dict[str, str]) -> Tuple[Dict[str, bool], Dict[str, str]]:
    """ Derives activity and stereochemistry calls for KR domains.

        Each query is aligned on its own against the KR reference domains and
        two signatures are extracted from the aligned query, using positions
        relative to a fixed reference sequence: one signature for the active
        site and one for stereochemistry.

        Arguments:
            queries: a mapping of query name to sequence

        Returns:
            a pair of dicts, the first mapping every query name to the result
                of is_active() for its active site signature,
                the second mapping query name to stereochemistry (e.g. A2),
                with entries only for those queries where
                predict_stereochemistry() gave a non-empty result
    """
    reference_name = "MAPSI|PKS|CAM00062.1|Erythromycin_synthase_modules_1_and_2|Sacc_KR1"

    # one (name, active site signature, stereochemistry signature) entry
    # per query, with queries handled in order of name
    extracted = []
    for name in sorted(queries):
        alignment = subprocessing.run_muscle_single(name, queries[name],
                                                    _KR_DOMAINS_FILENAME)
        aligned_reference = alignment[reference_name]
        aligned_query = alignment[name]

        active_site = utils.extract_by_reference_positions(
            aligned_query, aligned_reference, [110, 134, 147, 151])
        stereochem = utils.extract_by_reference_positions(
            aligned_query, aligned_reference,
            [90, 91, 92, 139, 144, 147, 149, 151])
        extracted.append((name, active_site, stereochem))

    # every query gets an activity result
    activity = {name: is_active(active_site)
                for name, active_site, _ in extracted}

    # queries without a stereochemistry prediction are left out entirely
    stereochemistry = {}
    for name, _, stereochem in extracted:
        chem = predict_stereochemistry(stereochem)
        if not chem:
            continue
        stereochemistry[name] = chem

    return activity, stereochemistry
示例#5
0
def run_minowa(sequence_info: Dict[str, str], startpos: int, muscle_ref: str,
               ref_sequence: str, positions_file: str, data_dir: str,
               hmm_names: List[str]) -> Dict[str, Prediction]:
    """ Ranks a set of HMM profiles for each query by the score of its signature.

        Every query is aligned on its own against the reference set, a
        signature is extracted from the aligned query at the loaded positions
        of the reference sequence, and that signature is then scored against
        each of the named HMM profiles with hmmsearch.

        Arguments:
            sequence_info: a dict mapping sequence id to sequence
            startpos: an int to subtract from those positions in positions_file
            muscle_ref: the path of a file containing reference sequence to align against
            ref_sequence: the name of the reference sequence, within the alignment,
                          to base extractions on
            positions_file: the path of a file containing signature extraction positions
            data_dir: the directory containing HMM profiles for the current method
            hmm_names: the names of the HMM profiles for the current method,
                       each is expected in data_dir with a ".hmm" extension

        Returns:
            a dict mapping query sequence id to a MinowaPrediction, each built
                from (profile name, score) pairs in order of decreasing score,
                with ties broken by decreasing profile name
    """
    positions = get_positions(positions_file, startpos)

    predictions = {}  # type: Dict[str, Prediction]

    for query_id, query_seq in sequence_info.items():
        alignment = subprocessing.run_muscle_single(query_id, query_seq,
                                                    muscle_ref)

        signature = utils.extract_by_reference_positions(alignment[query_id],
                                                         alignment[ref_sequence],
                                                         positions)
        # the signature goes to hmmsearch as a fasta entry, gaps swapped for X
        masked = signature.replace("-", "X")
        fasta_entry = ">%s\n%s\n" % (query_id, masked)

        scores = {
            hmm_name: hmmsearch(fasta_entry,
                                path.join(data_dir, hmm_name + ".hmm"))
            for hmm_name in hmm_names
        }

        ranked = sorted(scores.items(),
                        key=lambda pair: (pair[1], pair[0]),
                        reverse=True)
        predictions[query_id] = MinowaPrediction(ranked)
    return predictions
示例#6
0
def run_at_domain_analysis(domains: Dict[str, str]) -> ATSignatureResults:
    """ Builds a signature for each AT domain and scores it against references.

        Each domain is aligned on its own against the AT reference domains,
        in order of domain identifier, and its signature is extracted at the
        AT positions of the reference sequence.

        Arguments:
            domains: a dictionary mapping domain identifier (e.g. 'locus_AT2')
                     to domain sequence

        Returns:
            the result of score_signatures() for the query signatures and
                the reference signatures read from the signatures file
    """
    positions = get_at_positions(startpos=7)

    # construct the query signatures
    signatures = {}
    for name in sorted(domains):
        aligned = subprocessing.run_muscle_single(name, domains[name],
                                                  _AT_DOMAINS_FILENAME)
        signatures[name] = utils.extract_by_reference_positions(
            aligned[name], aligned[_REF_SEQUENCE], positions)

    # load the reference signatures only once all queries are processed
    references = fasta.read_fasta(_SIGNATURES_FILENAME)
    return score_signatures(signatures, references)
示例#7
0
def get_signature(query: str, hmm: str, positions: List[int]) -> str:
    """ Retrieves a signature from an aligned pair based on 1-indexed positions given.

        Arguments:
            query: the sequence of the query that the signature will be extracted from
            hmm: the sequence of the hit, used to adjust positions to account for introduced gaps
            positions: a list of 1-indexed positions to use for the signature,
                       positions are relative to hit start

        Returns:
            an empty string if no positions were given or if the largest
            position would be out of bounds of the hit (ignoring any '.'
            characters in the hit), otherwise the result of extracting
            the positions, converted to 0-indexed, from the query
    """
    if not positions:
        # nothing to extract, and max() would raise a ValueError on an empty list
        return ""
    ungapped = str(hmm).replace('.', '')
    if max(positions) > len(ungapped):
        # the hit was too small and a correct signature can't be generated
        return ""
    return utils.extract_by_reference_positions(query, hmm, [pos - 1 for pos in positions])
示例#8
0
def extract_nterminus(data_dir: str, cds_features: List[CDSFeature],
                      start_cds: Optional[CDSFeature]) -> Dict[str, str]:
    """ Extracts residues from the N-terminal end of each non-starting CDS.

        The first 50 residues of each CDS are aligned against the references
        in 'nterm.fasta', then the residues at positions 2 and 15 of the
        "EryAIII_5_6_ref" reference are extracted from the aligned query.

        Arguments:
            data_dir: the directory containing the N-terminal reference files
            cds_features: the list of CDSFeatures to extract terminals from
            start_cds: if not None, this CDS is skipped (matched by identity)

        Returns:
            a dictionary mapping CDS name to the result of the extraction
    """
    reference_file = os.path.join(data_dir, 'nterm.fasta')

    # collect all the leading sections before any alignment is run
    heads = {}
    for cds in cds_features:
        if cds is start_cds:
            continue
        translation = str(cds.translation)
        heads[cds.get_name()] = translation[:50]

    residues = {}
    for name, head in heads.items():
        aligned = subprocessing.run_muscle_single(name, head, reference_file)
        residues[name] = utils.extract_by_reference_positions(
            aligned[name], aligned["EryAIII_5_6_ref"], [2, 15])
    return residues
示例#9
0
def extract_nterminus(data_dir, genes, start_gene):
    """ Extracts residues from the N-terminal end of each non-starting gene.

        The first 50 residues of each gene are aligned against the references
        in 'nterm.fasta', then the residues at positions 2 and 15 of the
        "EryAIII_5_6_ref" reference are extracted from the aligned query.

        Arguments:
            data_dir: the directory containing the N-terminal reference files
            genes: the genes to extract terminals from, each providing
                   get_name() and a translation
            start_gene: the name of the gene to skip, compared to each gene's name

        Returns:
            a dictionary mapping gene name to the result of the extraction
    """
    reference_file = os.path.join(data_dir, 'nterm.fasta')

    # collect all the leading sections before any alignment is run
    heads = {}
    for gene in genes:
        name = gene.get_name()
        if name == start_gene:
            continue
        translation = str(gene.translation)
        heads[name] = translation[:50]

    residues = {}
    for name, head in heads.items():
        aligned = subprocessing.run_muscle_single(name, head, reference_file)
        residues[name] = utils.extract_by_reference_positions(
            aligned[name], aligned["EryAIII_5_6_ref"], [2, 15])
    return residues