Exemplo n.º 1
0
def rfam_subopt_pred(all_sequence_fasta,
                     cm_ref_str,
                     params=None,
                     threads=1,
                     timeout=None):
    """
    Run hybrid-ss-min on all sequences and post-process every result against a reference structure.

    :param all_sequence_fasta: input handed to run_hybrid_ss_min
    :param cm_ref_str: reference structure passed to _helper_subopt together with each folded sequence
    :param params: optional dict; a truthy "mfold" entry (tuple or list of 3 numbers) is forwarded
                   to run_hybrid_ss_min
    :param threads: 1 processes the results sequentially, any other value uses a multiprocessing.Pool
    :param timeout: forwarded to run_hybrid_ss_min and to every _helper_subopt call
    :return: list with one _helper_subopt result per entry returned by run_hybrid_ss_min
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    fold_kwargs = dict(threads=threads, timeout=timeout)
    mfold = params.get('mfold')
    if mfold:
        assert isinstance(mfold, (tuple, list)) and 3 == len(mfold), \
            "Incorrect parameters for hybrid_ss_min given. Need tuple of 3 numbers."
        fold_kwargs['mfold'] = mfold
    subs = run_hybrid_ss_min(all_sequence_fasta, **fold_kwargs)

    # now compute rna distance score
    if threads == 1:
        new_structures = [
            _helper_subopt(seq, cm_ref_str, timeout=timeout) for seq in subs
        ]
    else:
        from functools import partial

        # bind timeout by keyword so the pooled workers honour it exactly
        # like the sequential branch does
        worker = partial(_helper_subopt, timeout=timeout)
        with multiprocessing.Pool(processes=threads) as pool:
            new_structures = pool.starmap(
                worker, [(seq, cm_ref_str) for seq in subs])

    return new_structures
Exemplo n.º 2
0
def select_sequences_from_similarity_rec(dist_mat: np.ndarray,
                                         sim_threshold_percent=90) -> list:
    """
    Select indices from a similarity table, dropping entries too similar to an already kept one.

    Rows of the transposed table are visited in order. An index that has not been
    excluded so far is kept; every index whose non-NaN value in that row reaches the
    threshold, and which is not already kept, is excluded.

    :param dist_mat: distmat table, by default obtained from read_clustal_distmat_file, values in percent
    :param sim_threshold_percent: threshold for similarity in percent
    :return: sorted list of kept indices; [0] when dist_mat is None
    """
    ml.debug(fname())
    if dist_mat is None:
        return [0]

    rows = dist_mat.transpose()
    positions = np.arange(len(rows))
    kept = set()
    dropped = set()

    for idx, row in enumerate(rows):
        valid = ~np.isnan(row)
        # indices whose similarity to the current row reaches the threshold
        similar = positions[valid][row[valid] >= sim_threshold_percent]

        if idx not in dropped:
            kept.add(idx)
        # entries that are already kept are never excluded afterwards
        dropped.update(set(similar.tolist()) - kept)

    return sorted(kept)
Exemplo n.º 3
0
def _aligner_block(nr_homolog_hits_file, params, msa_alg, threads=None):
    """
    Align the given sequences with the selected MSA program.

    returns alignment file in clustal format
    :param nr_homolog_hits_file: input file handed to the aligner wrapper
    :param params: dict with optional extra options under keys "clustalo" / "muscle";
                   None is treated as an empty dict
    :param msa_alg: "clustalo" or "muscle"
    :param threads: int, when truthy it is added as --threads for clustalo (not used for muscle)
    :return: alignment file as returned by compute_clustalo_clasic / run_muscle
    :raises AttributeError: when msa_alg is neither "clustalo" nor "muscle"
    """
    ml.debug(fname())
    if params is None:
        # the muscle branch already tolerated missing params, make clustalo agree
        params = dict()

    if msa_alg == 'clustalo':
        clustal_params = '--outfmt=clustal --force'
        user_params = params.get('clustalo', '')
        if user_params:
            # separate user options from the fixed ones, otherwise they
            # would be glued onto "--force"
            clustal_params += ' ' + user_params

        if threads:
            clustal_params += ' --threads={}'.format(threads)
        alig_file = compute_clustalo_clasic(nr_homolog_hits_file,
                                            clustalo_params=clustal_params)

    elif msa_alg == 'muscle':
        muscle_params = params.get('muscle')
        if muscle_params:
            alig_file = run_muscle(nr_homolog_hits_file,
                                   muscle_params=muscle_params,
                                   reorder=True)
        else:
            alig_file = run_muscle(nr_homolog_hits_file, reorder=True)

    else:
        msg = 'invalig MSA alg chosen {}, valid are "clustalo" and "muscle"'.format(
            msa_alg)
        print(msg)
        # carry the message in the exception too, so it is not lost when stdout is
        raise AttributeError(msg)

    return alig_file
Exemplo n.º 4
0
def subopt_fold_alifold(all_fasta_hits_file,
                        homologs_file,
                        aligner='muscle',
                        params=None,
                        threads=None):
    """
    run clustal/muscle on selected homologs file, compute the alifold consensus structure
    from that alignment and post-process hybrid-ss-min results of all hits against it

    :param all_fasta_hits_file: input handed to run_hybrid_ss_min
    :param homologs_file: input handed to the aligner
    :param aligner: "clustalo" or "muscle"
    :param params: optional dict, keys read here: "clustalo", "muscle", "alifold", "mfold"
                   ("mfold" defaults to (10, 2, 20))
    :param threads: 1 processes the results sequentially, any other value (including None)
                    uses a multiprocessing.Pool; a truthy value is also passed to clustalo
    :return: list with one _helper_subopt result per entry returned by run_hybrid_ss_min
    :raises KeyError: when aligner is not recognized
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    # intermediate files, removed in the finally block even when a later step fails
    temp_files = []
    try:
        # run aligner
        # =============================================================================================================
        if 'clustalo' == aligner:
            clustal_params = ' --outfmt=clustal --force'
            user_params = params.get('clustalo', '')
            if user_params:
                # keep user options separated from "--force"
                clustal_params += ' ' + user_params

            if threads:
                clustal_params += ' --threads={}'.format(threads)
            alig_file = compute_clustalo_clasic(
                homologs_file, clustalo_params=clustal_params)

        elif 'muscle' == aligner:
            alig_file = run_muscle(homologs_file,
                                   muscle_params=params.get('muscle', ''),
                                   reorder=False)

        else:
            raise KeyError(
                'provided key ({}) not recognized - avalible: "clustalo" "muscle"'.
                format(aligner))
        temp_files.append(alig_file)

        # run consensus prediction
        # =============================================================================================================
        alif_file = compute_alifold(alig_file,
                                    alifold_params=params.get('alifold', ''))
        temp_files.append(alif_file)

        # possibly need to decode alifold structure
        alif_str = read_seq_str(alif_file)[0]
        consensus_structure = alif_str.letter_annotations['ss0']

        subs = run_hybrid_ss_min(all_fasta_hits_file,
                                 mfold=params.get('mfold', (10, 2, 20)),
                                 threads=threads)

        # now compute rna distance score
        if threads == 1:
            new_structures = [
                _helper_subopt(seq, consensus_structure) for seq in subs
            ]
        else:
            with multiprocessing.Pool(processes=threads) as pool:
                tuples = [(seq, consensus_structure) for seq in subs]
                new_structures = pool.starmap(_helper_subopt, tuples)

        return new_structures
    finally:
        if temp_files:
            remove_files_with_try(temp_files)
Exemplo n.º 5
0
def subopt_fold_query(all_fasta_hits_file,
                      query_file,
                      params=None,
                      threads=1,
                      timeout=None):
    """
    use folded query sequence as a reference,
    fold all sequences by Unafold, then select structure most similar to query

    accepted parameters (keys of params):
    "RNAfold" - options passed to rnafold_prediction for the query
    "mfold"   - tuple or list of 3 numbers forwarded to run_hybrid_ss_min

    :param all_fasta_hits_file: input handed to run_hybrid_ss_min
    :param query_file: input handed to rnafold_prediction
    :param params: optional dict, see accepted parameters above
    :param threads: 1 processes the results sequentially, any other value uses a multiprocessing.Pool
    :param timeout: forwarded to rnafold_prediction, run_hybrid_ss_min and every _helper_subopt call
    :return: list with one _helper_subopt result per entry returned by run_hybrid_ss_min
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    # get single query structure
    query_structure = rnafold_prediction(query_file,
                                         params.get('RNAfold', ''),
                                         timeout=timeout)

    fold_kwargs = dict(threads=threads, timeout=timeout)
    mfold = params.get('mfold')
    if mfold:
        assert isinstance(mfold, (tuple, list)) and 3 == len(mfold)
        fold_kwargs['mfold'] = mfold
    subs = run_hybrid_ss_min(all_fasta_hits_file, **fold_kwargs)

    # the name of the structure annotation is stored in annotations['sss'];
    # the first listed one is used as the reference
    query_record = query_structure[0]
    qs_string = query_record.letter_annotations[
        query_record.annotations['sss'][0]]

    # now compute rna distance score
    if threads == 1:
        new_structures = [
            _helper_subopt(seq, qs_string, timeout=timeout) for seq in subs
        ]
    else:
        from functools import partial

        # bind timeout by keyword so the pooled workers honour it exactly
        # like the sequential branch does
        worker = partial(_helper_subopt, timeout=timeout)
        with multiprocessing.Pool(processes=threads) as pool:
            new_structures = pool.starmap(
                worker, [(seq, qs_string) for seq in subs])

    return new_structures
Exemplo n.º 6
0
def rnafold_prediction(fasta2predict, params='', timeout=None):
    """
    Run rnafold_fasta on the input and return the parsed result.

    :param fasta2predict: input handed to rnafold_fasta
    :param params: options handed to rnafold_fasta
    :param timeout: forwarded to rnafold_fasta
    :return: whatever read_seq_str parses from the output file
    """
    ml.debug(fname())
    fd, structure_output_file = mkstemp(prefix='rba_',
                                        suffix='_54',
                                        dir=CONFIG.tmpdir)
    os.close(fd)

    try:
        structure_output_file = rnafold_fasta(fasta2predict,
                                              structure_output_file,
                                              params,
                                              timeout=timeout)
        return read_seq_str(structure_output_file)
    finally:
        # remove the temporary output also when folding or parsing fails
        remove_one_file_with_try(structure_output_file)
Exemplo n.º 7
0
def run_clustal_profile2seqs_align(msa_file,
                                   fasta_seq_file,
                                   clustalo_params='',
                                   outfile=None):
    """
    run clustal align MSA to seqs
    aligned columns in input MSA file are preserved and only new sequences are aligned and together they form new
     alignment

    If the first clustalo call fails, the profile is rewritten with a trailing gap column and
    the call is retried once.

    :param msa_file: msa file (works with stockholm)
    :param fasta_seq_file: file with sequences to be aligned (format can be enforced with --infmt in clustalo_params)
    :param clustalo_params: params as accepted by clustalo
    :param outfile: outfile path, if not provided, tempfile will be created with output
    :return: outfile MSA path
    :raises exceptions.ClustaloException: when the retry fails as well; carries the captured
                                          stdout/stderr of both clustalo calls
    """
    ml.info('Runing clustalo profile.')
    ml.debug(fname())

    def _try_rescue(profile_file):
        # beware AlignIO truncates sequence names so they become non-unique, then clustalo also fails
        # NOTE(review): the profile is parsed as clustal here although the docstring
        # says stockholm input works - confirm the rescue path for stockholm files
        ml.warning(
            'Trying rescue for profile alignment if profile has no gaps, sequences appears not aligned. '
            'Appending trailing gap to overcome the issue.')
        a = AlignIO.read(profile_file, format='clustal')
        s = [SeqRecord(Seq(str(i.seq) + '-'), id=i.id) for i in a]
        fa = AlignIO.MultipleSeqAlignment(s)

        fd, temp = mkstemp(prefix='rba_', suffix='_56', dir=CONFIG.tmpdir)
        with os.fdopen(fd, 'w') as fh:
            AlignIO.write(fa, fh, format='fasta')
        return temp

    def _build_cmd(profile_file):
        # same command line for the first attempt and for the rescue attempt
        built = [
            '{}clustalo'.format(CONFIG.clustal_path), '--force', '-i',
            fasta_seq_file, '--profile1', profile_file, '-o', clustalo_file
        ]
        if clustalo_params:
            built += clustalo_params.split()
        return built

    if outfile:
        clustalo_file = outfile
    else:
        c_fd, clustalo_file = mkstemp(prefix='rba_',
                                      suffix='_57',
                                      dir=CONFIG.tmpdir)
        os.close(c_fd)

    with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
        cmd = _build_cmd(msa_file)
        ml.debug(cmd)
        r = call(cmd, stdout=tmp, stderr=tmp)

        if r:
            ml.warning('Profile align failed.')

            # Initiate rescue attempt
            rewriten_msa = _try_rescue(msa_file)
            try:
                cmd2 = _build_cmd(rewriten_msa)
                ml.debug(cmd2)
                r2 = call(cmd2, stdout=tmp, stderr=tmp)
            finally:
                # the rewritten profile is only needed for the retry
                remove_one_file_with_try(rewriten_msa)

            if r2 != 0:
                msgfail = 'Call to clustalo for aligning profile to sequences failed.'
                ml.error(msgfail)
                ml.error(cmd)
                ml.error(cmd2)
                # the child processes advanced the shared file offset to the end;
                # rewind, otherwise read() returns an empty string
                tmp.seek(0)
                raise exceptions.ClustaloException(msgfail, tmp.read())
    return clustalo_file
Exemplo n.º 8
0
def cmmodel_rnafold_c(allhits_fasta,
                      cmmodel_file,
                      threads=None,
                      params=None,
                      timeout=None):
    """
    Align all hits to a covariance model and refold them with RNAfold -C.

    Steps performed here: sanitize the fasta names, run cmalign, write the upper-cased
    alignment in clustal format together with a mock consensus (RF line with every
    character outside ACTGU replaced by "_", plus the SS_cons line converted by
    cm_strucutre2br), run compute_refold on both files, run rnafold_prediction on its
    output and restore the original names.

    :param allhits_fasta: input handed to sanitize_fasta_file
    :param cmmodel_file: model handed to run_cmalign_on_fasta
    :param threads: when truthy, added to cmalign options as --cpu
    :param params: optional dict, keys read here: "cmalign" and "RNAfold" (str, default "-C")
    :param timeout: forwarded to run_cmalign_on_fasta, compute_refold and rnafold_prediction
    :return: result of desanitize_fasta_names_in_seqrec_list for the predicted structures
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    sanitized_fasta, san_dict = sanitize_fasta_file(allhits_fasta)

    # cmalign options: cpu count, user supplied options, --notrunc is always enforced
    cmalign_opts = '--cpu {}'.format(threads) if threads else ''
    user_cmalign = params.get('cmalign')
    if user_cmalign:
        cmalign_opts = cmalign_opts + ' ' + user_cmalign
    if '--notrunc' not in cmalign_opts:
        cmalign_opts = cmalign_opts + ' --notrunc'

    # RNAfold options: -C is always enforced
    rnafold_opts = params.get('RNAfold', '-C')
    assert isinstance(rnafold_opts,
                      str), "Incorrect parameters for RNAfold -C"
    if '-C' not in rnafold_opts:
        rnafold_opts = rnafold_opts + ' -C'

    cmalign_output = run_cmalign_on_fasta(sanitized_fasta,
                                          cmmodel_file,
                                          cmalign_params=cmalign_opts,
                                          timeout=timeout)
    alignment = read_st(cmalign_output)

    # inputs of the alignment step are no longer needed once it is parsed
    remove_files_with_try([sanitized_fasta, cmalign_output])

    # ===== use refold.pl directly ====
    upper_alignment = alignment.to_upper()
    consensus_fd, mock_consensus_path = mkstemp(prefix='rba_',
                                                suffix='_41',
                                                dir=CONFIG.tmpdir)
    clustal_fd, clustal_path = mkstemp(prefix='rba_',
                                       suffix='_42',
                                       dir=CONFIG.tmpdir)
    with os.fdopen(clustal_fd, 'w') as clustal_handle, \
            os.fdopen(consensus_fd, 'w') as consensus_handle:
        upper_alignment.write_clustal(clustal_handle)

        rf_line = re.sub('[^ACTGU]',
                         '_',
                         upper_alignment.column_annotations['RF'],
                         flags=re.IGNORECASE)
        ss_line = cm_strucutre2br(
            upper_alignment.column_annotations['SS_cons'])
        consensus_handle.write('{}\n{}\n'.format(rf_line, ss_line))

    constraint_path = compute_refold(clustal_path,
                                     mock_consensus_path,
                                     timeout=timeout)
    predicted = rnafold_prediction(constraint_path,
                                   params=rnafold_opts,
                                   timeout=timeout)
    restored = desanitize_fasta_names_in_seqrec_list(predicted, san_dict)

    remove_files_with_try(
        [constraint_path, clustal_path, mock_consensus_path])

    return restored
Exemplo n.º 9
0
def alifold_refold_prediction(nr_homologs_hits_fasta,
                              all_hits_fasta,
                              refold='refold',
                              threads=None,
                              params=None,
                              msa_alg='clustalo'):
    """
    return predicted structures for all hits based on provided sequence homologs
    ! beware, clustal mixes order of sequences in profile alignment, correct for it

    param keys read in this function: "alifold", "clustalo_profile", "RNAfold",
    "repred_unpaired_tr", "conseq_conserved"; params is also handed to _aligner_block,
    which reads "clustalo" / "muscle"

    :param nr_homologs_hits_fasta: homolog sequences used to build the initial alignment
    :param all_hits_fasta: all hits; they are profile-aligned to the initial alignment
    :param refold: one of "refold", "refold_rnafoldc", "conserved_ss_rnafoldc"
    :param threads: passed to _aligner_block and, when truthy, to clustalo as --threads
    :param params: optional dict, see keys above
    :param msa_alg: aligner name handed to _aligner_block
    :return: result of desanitize_fasta_names_in_seqrec_list for the predicted structures
    :raises Exception: when refold is not one of the recognized procedures
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    # validate before any temporary file is created, so a wrong value leaks nothing
    ref_pred = ['refold', 'refold_rnafoldc', 'conserved_ss_rnafoldc']
    if refold not in ref_pred:
        raise Exception(
            'refold procedure not recognized: {}, possible values are {}'.
            format(refold, ' '.join(ref_pred)))

    nr_path, san_dict = sanitize_fasta_file(nr_homologs_hits_fasta)
    all_path, san_dict = sanitize_fasta_file(all_hits_fasta,
                                             used_dict=san_dict)

    cl_file = _aligner_block(nr_path, params, msa_alg, threads)

    # cannot rely on that, the order of a cl_file would be the same as the order of the nr_homolog_hits_file
    ali_file = compute_alifold(cl_file,
                               alifold_params=params.get('alifold', ''))

    consensus_record = read_seq_str(ali_file)[0]

    clustalo_profile_params = '--outfmt clustal '
    clustalo_profile_params += params.get('clustalo_profile', '')
    if threads:
        clustalo_profile_params += ' --threads {}'.format(threads)
    realign_file = run_clustal_profile2seqs_align(
        cl_file, all_path, clustalo_params=clustalo_profile_params)
    realign_alig = AlignIO.read(realign_file, format='clustal')

    # slice alignment ( get seqname from nr_homolog_hits_file, find it in the realign and slice the whole segment off
    #  take care that the id may be the same and it must be checked for multiple occurence

    first_nr_record = _parse_first_record_only(nr_path)

    realign_allseq_possition = [
        i for i, seq in enumerate(realign_alig) if seq.id == first_nr_record.id
    ]

    # the last occurrence of the id marks the start of the original profile block;
    # an IndexError is raised here when the id is not present in the realignment
    new_alig_for_refold = realign_alig[:realign_allseq_possition[-1]]
    old_alig_in_new = realign_alig[realign_allseq_possition[-1]:]

    orig_alignment = AlignIO.read(cl_file, format='clustal')

    first_original_alignment_record = orig_alignment[0]

    match_original_seq_in_new_alig = [
        i for i in old_alig_in_new
        if i.id == first_original_alignment_record.id
    ][0]

    mapping = _map_alignment_columns_from_profile_match(
        first_original_alignment_record, match_original_seq_in_new_alig)

    # map and repair structure when mapping is unbiguous
    cs_encode = encode_structure_unicode(
        consensus_record.letter_annotations['ss0'])
    # NOTE(review): 49 == ord('1'); presumably the gap symbol in the encoding produced by
    # encode_structure_unicode - confirm against that helper
    new_consensus_structure_encoded = _repair_consensus_structure_by_maping(
        cs_encode,
        mapping,
        len(match_original_seq_in_new_alig.seq),
        gap_char=49)
    new_consensus_structure_repaired = repair_structure_any_variant(
        new_consensus_structure_encoded)

    new_consensus_structure = decode_structure_unicode(
        new_consensus_structure_repaired)

    new_consensus_sequence_list = _repair_consensus_structure_by_maping(
        [ord(i) for i in consensus_record.seq],
        mapping,
        len(match_original_seq_in_new_alig.seq),
        gap_char=ord('_'))
    new_consensus_sequence = ''.join(
        chr(i) for i in new_consensus_sequence_list)

    # write new consensus to a file
    a_fd, new_alifold_consensus_file = mkstemp(prefix='rba_',
                                               suffix='_33',
                                               dir=CONFIG.tmpdir)
    with os.fdopen(a_fd, 'w') as f:
        f.write(new_consensus_sequence + '\n')
        f.write(new_consensus_structure + '\n')

    # write sliced alignment to a file
    sa_fd, sliced_alignment_file = mkstemp(prefix='rba_',
                                           suffix='_34',
                                           dir=CONFIG.tmpdir)
    with os.fdopen(sa_fd, 'w') as f:
        AlignIO.write(new_alig_for_refold, f, 'clustal')

    # now process the file, and map alignment to consensus structure
    if refold in ['refold', 'refold_rnafoldc']:
        refold_file = compute_refold(sliced_alignment_file,
                                     new_alifold_consensus_file)

        if refold == 'refold_rnafoldc':
            # the refold output is used as constraints, so -C is always enforced
            rnafold_parameters = params.get('RNAfold', '')
            if '-C' not in rnafold_parameters:
                rnafold_parameters += ' -C'

            seq_str = rnafold_prediction(refold_file,
                                         params=rnafold_parameters)

        else:
            seq_str = read_seq_str(refold_file)

        remove_one_file_with_try(refold_file)

    else:
        # refold == 'conserved_ss_rnafoldc'
        st_alig_file = build_stockholm_from_clustal_alig(
            sliced_alignment_file, new_alifold_consensus_file)
        repred_tr = str(params.get('repred_unpaired_tr', '9'))
        conseq_conserved = params.get('conseq_conserved', 1)

        seq_str = _refold_with_unpaired_conservation(
            st_alig_file,
            repred_tr=repred_tr,
            conseq_conserved=conseq_conserved)
        remove_one_file_with_try(st_alig_file)

    structures_out = desanitize_fasta_names_in_seqrec_list(seq_str, san_dict)

    remove_files_with_try([
        nr_path, all_path, sliced_alignment_file, new_alifold_consensus_file,
        cl_file, ali_file, realign_file
    ])

    return structures_out