Exemplo n.º 1
0
def refold_stockholm(stockholm_alig, consensus_structure):
    """
    Refold aligned sequences under a consensus structure (refold.pl from
    the Vienna RNA package).

    The alignment is written to a temporary clustal file and a minimal fake
    alifold output carrying ``consensus_structure`` is written to a second
    temporary file. Both are passed to ``compute_refold`` and the file it
    returns is parsed with ``BA_support.parse_seq_str``.

    Every temporary file created here is removed before returning, including
    when one of the steps raises.

    :param stockholm_alig: alignment object providing ``write_clustal(handle)``
    :param consensus_structure: consensus structure string; its length also
        sets the length of the placeholder sequence in the fake alifold file
    :return: list of the records yielded by ``BA_support.parse_seq_str``
    """
    ml.debug(fname())

    # paths registered here are removed in the ``finally`` clause below
    tmp_files = []
    try:
        # convert to clustal alignment
        fd, clust_tempfile = mkstemp(prefix='rba_', suffix='_23', dir=CONFIG.tmpdir)
        tmp_files.append(clust_tempfile)
        with os.fdopen(fd, 'w') as f:
            stockholm_alig.write_clustal(f)

        # write fake alifold output with given consensus structure
        fd, alif_fake_file = mkstemp(prefix='rba_', suffix='_24', dir=CONFIG.tmpdir)
        tmp_files.append(alif_fake_file)
        with os.fdopen(fd, 'w') as f:
            # the consensus sequence in alifold file is really not used for anything
            f.write('A' * len(consensus_structure) + '\n')
            f.write(consensus_structure + '\n')

        # compute refold
        refold_constrained_file = compute_refold(clust_tempfile, alif_fake_file)
        tmp_files.append(refold_constrained_file)

        # materialise the parser output while the file is still open
        with open(refold_constrained_file, 'r') as f:
            parsed_seqs = list(BA_support.parse_seq_str(f))
    finally:
        # cleanup (also on failure, so temp files do not pile up in tmpdir)
        BA_support.remove_files_with_try(tmp_files)

    return parsed_seqs
Exemplo n.º 2
0
def run_cmalign_with_scores(fasta_file, cm_file, threads=None):
    """
    Run cmalign on a fasta file and return the alignment with its score table.

    cmalign is invoked through ``run_cmalign_on_fasta`` with ``--notrunc`` and
    ``--sfile`` pointing to a temporary file; ``--cpu`` is added only when
    ``threads`` is truthy. The resulting alignment is read with ``read_st``
    and the score file with ``read_cmalign_sfile``.

    The temporary score file and the alignment file are removed before
    returning, including when one of the steps raises.

    :param fasta_file: path to the fasta file handed to ``run_cmalign_on_fasta``
    :param cm_file: path to the covariance model handed to ``run_cmalign_on_fasta``
    :param threads: value for the cmalign ``--cpu`` option; omitted when falsy
    :return: tuple ``(cm_msa, cm_align_scores)`` where ``cm_align_scores`` has
        its index replaced by a plain ``0..n-1`` range
    """
    fd_sfile, cm_sfile_path = mkstemp(prefix='rba_',
                                      suffix='_29',
                                      dir=CONFIG.tmpdir)
    # only the path is needed; cmalign writes the score file itself
    os.close(fd_sfile)

    # paths registered here are removed in the ``finally`` clause below
    tmp_files = [cm_sfile_path]
    try:
        if threads:
            cm_params = '--notrunc --cpu {} --sfile {}'.format(
                threads, cm_sfile_path)
        else:
            cm_params = '--notrunc --sfile {}'.format(cm_sfile_path)

        cm_msa_file = run_cmalign_on_fasta(fasta_file,
                                           cm_file,
                                           cmalign_params=cm_params)
        tmp_files.append(cm_msa_file)

        cm_msa = read_st(cm_msa_file)

        # per-sequence cmalign scores are made accessible by the --sfile option
        # the bit score can be probably directly comparable with blast bit score
        # badly aligned sequences with cmalign have negative bitscore
        cm_align_scores = read_cmalign_sfile(cm_sfile_path)

        # replace whatever index the score table came with by positions 0..n-1
        cm_align_scores.index = range(len(cm_align_scores.index))
    finally:
        # cleanup (also on failure, so temp files do not pile up in tmpdir)
        BA_support.remove_files_with_try(tmp_files)

    return cm_msa, cm_align_scores
Exemplo n.º 3
0
def _trusted_hits_selection_wrapper(all_hits_,
                                    query_,
                                    cmscore_tr_,
                                    cm_threshold_percent_,
                                    len_diff_=0.1):
    """
    Select trusted (homologous) hits and compute their pairwise similarity.

    Hits are selected by a cm bit-score threshold: either the absolute
    ``cmscore_tr_`` or ``cm_threshold_percent_`` percent of the query bit score
    (``query_.annotations['cmstat'].bit_sc``). The query is prepended, exact
    duplicates are dropped with ``BA_support.non_redundant_seqs`` and only
    sequences whose length lies strictly within ``len_diff_`` of the query
    length are aligned with muscle. The identity distances of that alignment
    are converted to similarities in percent.

    behaviour:
        returns ``(similarity_table, trusted_seqs_with_query, msgs)``

        if no hit is selected, if all selected hits are identical to the
        query, or if fewer than two sequences pass the length filter, it
        returns ``(np.empty(0), [query_], msgs)`` and ``msgs`` holds the
        corresponding status message

    Temporary files are removed before returning, including when alignment
    or distance computation raises.

    :param all_hits_: list of hit sequence records carrying cm score annotations
    :param query_: query sequence record
    :param cmscore_tr_: absolute bit-score threshold; must be 0 when
        ``cm_threshold_percent_`` is given
    :param cm_threshold_percent_: threshold as percentage of the query bit
        score, or None to use ``cmscore_tr_``
    :param len_diff_: tolerated relative length difference to the query
    :return: tuple of similarity table, list of sequences and list of messages
    """
    ml.debug(fname())
    msgs = []

    def _fallback_to_query(msg):
        # Shared early exit: store and log the status message, echo it when
        # the logger level is above 20, and fall back to the query alone.
        msgs.append(msg)
        ml.info(msg)
        if ml.level > 20:
            print(msg)
        return np.empty(0), [query_], msgs

    # trusted sequence selection
    # ========================================================
    # the two threshold modes are mutually exclusive
    assert (cmscore_tr_ == 0) or cm_threshold_percent_ is None

    score = _extract_cmscore_from_hom_seqs(all_hits_)

    if cm_threshold_percent_ is not None:
        selection_threshold = cm_threshold_percent_ * query_.annotations[
            'cmstat'].bit_sc / 100
    else:
        selection_threshold = cmscore_tr_

    pred = infer_hits_cm(score, tr=selection_threshold)
    trusted_seqs_ = [hit for hit, keep in zip(all_hits_, pred) if keep]

    if not trusted_seqs_:
        return _fallback_to_query(
            'STATUS: No estimated full-length sequences from BLAST output '
            'selected as reference for structure prediction.\n'
            ' Using query sequence as reference.'
        )

    # add query to trusted sequences
    trusted_seqs_query = [query_] + trusted_seqs_

    # make nr list of sequences -> faster alignment
    # better selection
    nr_trusted_seqs_query = BA_support.non_redundant_seqs(trusted_seqs_query)

    # check if the homologous sequence is not exact match as query
    #  (ie taking non redundant set would be only one sequence)
    if len(nr_trusted_seqs_query) == 1:
        return _fallback_to_query(
            'STATUS: All sequences selected as reference are exactly same as query sequence.'
        )

    # select only sequences in some predefined length range to query
    # this is needed for longish ncRNAs
    ref_len = len(query_)
    min_len = ref_len * (1 - len_diff_)
    max_len = ref_len * (1 + len_diff_)
    nr_len_selected_trusted = [
        seq for seq in nr_trusted_seqs_query
        if min_len < len(seq) < max_len
    ]

    # An alignment needs at least two sequences. With len_diff_ <= 0 the
    # strict bounds reject even the query, leaving an empty list, so test for
    # "fewer than two" rather than "exactly one".
    if len(nr_len_selected_trusted) < 2:
        return _fallback_to_query(
            'No sequence satisfy the length difference condition ({}: {}-{})'.format(
                len_diff_,
                min_len,
                max_len
            )
        )

    # sanitize seq names (muscle has issues with too long names)
    san_hom_seqs, san_dict = BA_support.sanitize_fasta_names_in_seqrec_list(
        nr_len_selected_trusted)

    c_fd, trusted_sequence_file_ = mkstemp(prefix='rba_',
                                           suffix='_60',
                                           dir=CONFIG.tmpdir)
    # paths registered here are removed in the ``finally`` clause below
    tmp_files = [trusted_sequence_file_]
    try:
        with os.fdopen(c_fd, 'w') as f:
            SeqIO.write(san_hom_seqs, f, 'fasta')

        align_file = BA_support.run_muscle(trusted_sequence_file_, reorder=True)
        tmp_files.append(align_file)

        alig = AlignIO.read(align_file, format='clustal')
        distance_calc = DistanceCalculator(model='identity')
        dist_mat = distance_calc.get_distance(alig)
        # rebuild index from sanitized
        orig_index = [san_dict[name] for name in dist_mat.names]
        dist_mat_pd = pandas.DataFrame.from_records(dist_mat.matrix,
                                                    index=orig_index)
        # identity distance (0..1) -> similarity in percent
        dist_table_ = (1 - dist_mat_pd.values) * 100
    finally:
        # cleanup (also on failure, so temp files do not pile up in tmpdir)
        BA_support.remove_files_with_try(tmp_files)

    # NOTE(review): the table is computed from the non-redundant,
    # length-filtered set (in alignment order), while the returned list is the
    # unfiltered ``trusted_seqs_query``; confirm callers do not index the list
    # by table position.
    return dist_table_, trusted_seqs_query, msgs