Example #1
def me_centroid_homfold(fasta2predict, fasta_homologous_seqs, params=None):
    """
    run centroid_homefold several times and vary -g parameter, to predict the best possible structure
    :param fasta2predict:
    :param fasta_homologous_seqs:
    :param params:
    :return:
    """

    # first run centroid homefold for several stages of g (-1)
    # find the most stable structure value of g
    # structure of output

    if params is None:
        params = dict()

    ch_params = params.get('centroid_homfold', '') or ''

    if ('-g ' in ch_params and '-g -1' not in ch_params) or '-t ' in ch_params:
        print("We only allow centroid_homfold to run in automatic mode, where the structure is predicted with"
              " multiple weights and the best scoring structure is selected. Thresholds (-t) are also forbidden,"
              " as -t implies -g.")
        raise AttributeError('centroid_homfold is not permitted to run with "-g" or "-t".')
    ch_params += ' -g -1'

    first_structures = run_centroid_homfold(fasta2predict, fasta_homologous_seqs, centroid_homfold_params=ch_params)
    structures2return = list(centroid_homfold_select_best(first_structures))
    BA_support.remove_one_file_with_try(first_structures)
    return structures2return
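
A minimal call sketch (the FASTA paths are hypothetical; a params dict, when given, is passed through to centroid_homfold):

# Hypothetical usage; '-g' or '-t' inside params would raise AttributeError.
structures = me_centroid_homfold(
    'query_seqs.fasta',        # sequences whose structures are wanted
    'homologous_seqs.fasta',   # trusted homologs guiding the prediction
    params=None,               # or {'centroid_homfold': '<extra CLI flags>'}
)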
Example #2
def build_cm_model_rsearch(query_seq, path2selected_sim_array):
    ml.debug(fname())
    query_structure = rna_blast_analyze.BR_core.viennaRNA.RNAfold(
        str(query_seq.seq))[0]

    # remove any annotations from query:
    qs_clean = deepcopy(query_seq)
    qs_clean.annotations = dict()
    qs_clean.letter_annotations = dict()

    # build a Stockholm-like file for use in cmbuild
    st_like = StockholmAlig()
    st_like.append(qs_clean)
    st_like.column_annotations['SS_cons'] = query_structure

    fds, stock_file = mkstemp(prefix='rba_', suffix='_30', dir=CONFIG.tmpdir)
    with os.fdopen(fds, 'w') as f:
        st_like.write_stockholm(f)

    # run actual cmbuild
    cm_model_file = run_cmbuild(
        stock_file,
        cmbuild_params='--rsearch {}'.format(path2selected_sim_array))

    # cleanup
    BA_support.remove_one_file_with_try(stock_file)
    return cm_model_file
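
A usage sketch, assuming path2selected_sim_array points at a RIBOSUM-style similarity matrix file (the input cmbuild --rsearch consumes); the file names are illustrative:

from Bio import SeqIO

query_record = SeqIO.read('query.fasta', 'fasta')  # single-record FASTA (assumption)
cm_file = build_cm_model_rsearch(query_record, 'RIBOSUM85-60.mat')
# cm_file is the path of a covariance model built from the query sequence
# and its RNAfold-predicted secondary structure.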
Example #3
def centroid_homfold_fast(all_seqs, query, all_seqs_fasta, n, centroid_homfold_params, len_diff):
    ml.debug(fname())

    selected_seqs = centroid_homfold_fast_prep(all_seqs, query, n, len_diff)

    ch, homologous_file = mkstemp(prefix='rba_', suffix='_74', dir=CONFIG.tmpdir)
    with os.fdopen(ch, 'w') as h:
        SeqIO.write(selected_seqs, h, 'fasta')

    structures, _ = me_centroid_homfold(all_seqs_fasta, homologous_file, params=centroid_homfold_params)
    BA_support.remove_one_file_with_try(homologous_file)
    return structures
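
A sketch of the fast variant (all values are assumptions): it preselects up to n homologs within the length tolerance, then delegates to me_centroid_homfold.

# Hypothetical invocation; all_hits and query come from the surrounding pipeline.
structures = centroid_homfold_fast(
    all_seqs=all_hits,                  # candidate homologous SeqRecords
    query=query,                        # query SeqRecord
    all_seqs_fasta='to_predict.fasta',  # sequences whose structures are wanted
    n=10,                               # max sequences used as homology evidence
    centroid_homfold_params={},
    len_diff=0.1,                       # allowed relative length difference vs the query
)

Example #4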
def extend_meta_core(analyzed_hits,
                     query,
                     args_inner,
                     all_short,
                     multi_query,
                     iteration,
                     ih_model,
                     timeout=None):
    ml.debug(fname())
    # update params if different config is requested
    CONFIG.override(tools_paths(args_inner.config_file))

    blast_args = deepcopy(args_inner)
    locarna_args = deepcopy(args_inner)
    b_all_short = deepcopy(all_short)
    l_all_short = deepcopy(all_short)

    if args_inner.repredict_file is None:
        fd, repred_file = mkstemp(prefix='rba_',
                                  suffix='_18',
                                  dir=CONFIG.tmpdir)
        os.close(fd)
    else:
        repred_file = args_inner.repredict_file

    for i, args in enumerate([blast_args, locarna_args]):
        args.prediction_method = []
        args.pred_params = dict()
        args.dump = None
        args.pdf_out = None
        args.pandas_dump = None
        args.repredict_file = repred_file + str(i)
        args.dev_pred = False
        args.logfile = None
        args.json = None
        args.html = None
        args.cm_file = ih_model

    analyzed_hits_simple = deepcopy(analyzed_hits)
    analyzed_hits_locarna = deepcopy(analyzed_hits)

    analyzed_hits_simple, _, _, _ = extend_simple_core(analyzed_hits_simple,
                                                       query, blast_args,
                                                       b_all_short,
                                                       multi_query, iteration,
                                                       ih_model)
    analyzed_hits_locarna, _, _, _ = extend_locarna_core(analyzed_hits_locarna,
                                                         query,
                                                         locarna_args,
                                                         l_all_short,
                                                         multi_query,
                                                         iteration,
                                                         ih_model,
                                                         timeout=timeout)

    # add cmstat to query
    analyzed_hits.query = analyzed_hits_simple.query

    order_out = []

    b_dict = {BA_support.get_hit_n(h): h for h in analyzed_hits_simple.hits}
    l_dict = {BA_support.get_hit_n(h): h for h in analyzed_hits_locarna.hits}
    ok_keys = sorted(set(b_dict.keys()) | set(l_dict.keys()))
    for inum in ok_keys:
        bh = b_dict.get(inum, None)
        lh = l_dict.get(inum, None)

        hits = [bh, lh]
        # fall back to the simple extension if locarna returned an empty hit,
        # and handle the case when both methods returned empty hits
        filtered_hits = [h for h in hits if h is not None]
        if len(filtered_hits) == 1:
            msg = 'Only one extension method completed successfully for {}. ' \
                  'Using the successfully extended sequence in the output.'.format(filtered_hits[0].extension.id)
            ml.info(msg)
            if ml.getEffectiveLevel() < 20:
                print(msg)
            analyzed_hits.hits.append(filtered_hits[0])
            continue
        elif len(filtered_hits) == 0:
            # defensive branch: inum comes from the union of both key sets,
            # so at least one hit should normally be present
            analyzed_hits.hits_failed.append(lh)
            continue

        bit_scores = [
            h.extension.annotations['cmstat']['bit_sc'] for h in hits
        ]

        bit_index = bit_scores.index(max(bit_scores))
        order_out.append(bit_index)

        analyzed_hits.hits.append(hits[bit_index])

    # build failed hits
    b_dict_failed = {
        BA_support.get_hit_n(h): h
        for h in analyzed_hits_simple.hits_failed
    }
    l_dict_failed = {
        BA_support.get_hit_n(h): h
        for h in analyzed_hits_locarna.hits_failed
    }
    for inum in sorted(set(b_dict_failed) | set(l_dict_failed)):
        if inum not in ok_keys:
            if inum in b_dict_failed:
                analyzed_hits.hits_failed.append(b_dict_failed[inum])
            elif inum in l_dict_failed:
                analyzed_hits.hits_failed.append(l_dict_failed[inum])
            else:
                raise KeyError(
                    "Failed to find inum key in failed extensions. This should not happen."
                )

    # build the repredict file here if needed
    if args_inner.repredict_file:
        b_repredict = BA_support.iter2file_name(blast_args.repredict_file,
                                                multi_query, iteration)
        l_repredict = BA_support.iter2file_name(locarna_args.repredict_file,
                                                multi_query, iteration)
        o_repredict = BA_support.iter2file_name(args_inner.repredict_file,
                                                multi_query, iteration)
        with open(b_repredict,
                  'r') as barf, open(l_repredict,
                                     'r') as larf, open(o_repredict,
                                                        'w') as reprf:
            """
            please note that order of files to merge must be same as the order of methods in previous for cycle
            ie same as the one in which order_out var is set
            """
            bb = (barf, larf)

            fl = bb[0].readline()
            reprf.write(fl)
            fl = bb[0].readline()
            reprf.write(fl)
            # dump first line of the other documents
            [[i.readline() for _ in range(1)] for i in bb[1:]]

            for o in order_out:
                lll = [i.readline() for i in bb]
                reprf.write(lll[o])

    # recreate needed data from selected hits
    homology_prediction = []
    homol_seqs = []
    for hit in analyzed_hits.hits:
        homology_prediction.append(hit.hpred)
        if hit.hpred:
            homol_seqs.append(hit.extension)

        # add default prediction if it is not present
        if 'ss0' not in hit.extension.letter_annotations:
            if 'sss' not in hit.extension.annotations:
                hit.extension.annotations['sss'] = []
            hit.extension.annotations['sss'] += ['ss0']
            hit.extension.letter_annotations['ss0'] = '.' * len(
                hit.extension.seq)

    # remove description from hits and sources
    for hit in analyzed_hits.hits:
        hit.extension.description = ''

    if args_inner.cm_file or args_inner.use_rfam:
        cm_file_rfam_user = ih_model
    else:
        cm_file_rfam_user = None
        BA_support.remove_one_file_with_try(ih_model)
    return analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user
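
To make the selection rule above concrete, here is a self-contained sketch of the per-hit merge: for each hit number found by either method, the extension with the higher cmstat bit score wins, and ties go to the first (simple/BLAST) method.

# Toy scores, not real output.
blast_scores = {1: 35.2, 2: 12.0}
locarna_scores = {1: 33.9, 3: 20.5}
merged = {}
for n in sorted(set(blast_scores) | set(locarna_scores)):
    candidates = {name: sc for name, sc in
                  [('blast', blast_scores.get(n)), ('locarna', locarna_scores.get(n))]
                  if sc is not None}
    merged[n] = max(candidates, key=candidates.get)
assert merged == {1: 'blast', 2: 'blast', 3: 'locarna'}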
Example #5
def infer_homology(analyzed_hits,
                   args,
                   cm_model_file,
                   multi_query=False,
                   iteration=0):
    """
    This is wrapper for infer homology methods. It deals with different options for generating CM.
    :return:
    """
    ml.info('Infering homology...')
    ml.debug(fname())
    bits, eval, loc_score, alig_length = hit_cons_characteristic(
        analyzed_hits.hits)

    # always run cmscan on rfam for informative reasons
    #  but use inferred CM only if --use_rfam was given
    #  if CM provided, also run inference but use provided file
    # print explanation alongside this information

    # find and extract cm model
    # This code is moved to each extension method to allow fail-fast if model is found in RFAM
    # cm_model_file, analyzed_hits = find_and_extract_cm_model(args, analyzed_hits)

    # include query seq in fasta file to get relevant bit score
    fd_f, fd_fasta = mkstemp(prefix='rba_', suffix='_28', dir=CONFIG.tmpdir)
    with os.fdopen(fd_f, 'w') as f:
        for seq in [analyzed_hits.query] + analyzed_hits.res_2_record_list():
            f.write('>{}\n{}\n'.format(seq.id, str(seq.seq)))

    cm_msa, cm_align_scores = run_cmalign_with_scores(fd_fasta,
                                                      cm_model_file,
                                                      threads=args.threads)

    _add_rsearch_align_scores2anal_hits(analyzed_hits, cm_align_scores)

    # drop the first score (the query) from the prediction scores
    prediction = infer_hits_cm(cm_align_scores[1:].bit_sc)

    # write scores to a table, compute it for all data and run some correlation statistics
    if args.repredict_file:
        # note that the first score belongs to the query and acts as a benchmark here
        cm_msa_conservation = alignment_sequence_conservation(cm_msa,
                                                              gap_chars='-.')

        repredict_file = BA_support.iter2file_name(args.repredict_file,
                                                   multi_query, iteration)
        with open(repredict_file, 'w') as f:
            _print_table_for_corelation(f, cm_align_scores.seq_name[1:], bits,
                                        e_val, loc_score, alig_length,
                                        cm_msa_conservation[1:],
                                        cm_align_scores.bit_sc[1:],
                                        cm_msa_conservation[0],
                                        cm_align_scores.bit_sc[0])

    BA_support.remove_one_file_with_try(fd_fasta)

    selected_hits = [
        hit.extension for b, hit in zip(prediction, analyzed_hits.hits) if b
    ]

    if args.cm_file or args.use_rfam:
        r_cm_file = cm_model_file
    else:
        r_cm_file = None
        BA_support.remove_one_file_with_try(cm_model_file)

    return prediction, selected_hits, r_cm_file
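
A call sketch; the CM file would typically come from build_cm_model_rsearch or run_cmfetch (both from this codebase), and the remaining arguments are pipeline objects:

# Hypothetical invocation within the pipeline.
prediction, selected_hits, r_cm_file = infer_homology(
    analyzed_hits, args, cm_model_file, multi_query=False, iteration=0)
# prediction is a list of booleans (one per hit), selected_hits keeps the
# extensions judged homologous, and r_cm_file is None unless the CM is kept.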
Example #6
def locarna_worker(pack):
    ml.debug(fname())
    one_expanded_hit, query_seq, locarna_params, anchor_length = pack

    locarna_file1 = locarna_file2 = loc_out_file = None

    try:
        # Read the BLAST-aligned segment and use it as anchors for LocARNA:
        # run LocARNA in local mode on the query and the extended sequence,
        # with the BLAST-aligned segment serving as the anchor.
        blast_entry = one_expanded_hit.annotations['blast'][1]

        anchors = LocarnaAnchor(
            to_rna(blast_entry.query),
            blast_entry.match,
            to_rna(blast_entry.sbjct),
            anchor_length=anchor_length
        )

        if anchors.too_many_anchors:
            ml.info('Too many anchors for {}. Can handle up to 520 distinct anchors.'.format(one_expanded_hit.id))

        # access the locarna aligner directly
        fd1, locarna_file1 = mkstemp(prefix='rba_', suffix='_20', dir=CONFIG.tmpdir)
        with os.fdopen(fd1, 'w') as fp_locarna_file_1:
            ql1, ql2 = anchors.anchor_whole_seq(str(query_seq), 'query')
            write_clustal_like_file_with_anchors(fp_locarna_file_1,
                                                 'query',
                                                 str(query_seq),
                                                 (
                                                     ('#A1', ql1.split()[0]),
                                                     ('#A2', ql2.split()[0])
                                                 ))

        fd2, locarna_file2 = mkstemp(prefix='rba_', suffix='_21', dir=CONFIG.tmpdir)
        with os.fdopen(fd2, 'w') as fp_locarna_file_2:
            sl1, sl2 = anchors.anchor_whole_seq(str(one_expanded_hit.seq), 'subject')
            write_clustal_like_file_with_anchors(fp_locarna_file_2,
                                                 one_expanded_hit.id,
                                                 str(one_expanded_hit.seq),
                                                 (
                                                     ('#A1', sl1.split()[0]),
                                                     ('#A2', sl2.split()[0])
                                                 ))

        loc_out_file = run_locarna(
            locarna_file1,
            locarna_file2,
            locarna_params
        )

        # read locarna alignment
        with open(loc_out_file, 'r') as f:
            locarna_alig = parse_locarna_alignment(f)

        if len(locarna_alig) != 2:
            raise exceptions.SubseqMatchError('There must be 2 sequences in Locarna alignment.')

        loc_rep = create_report_object_from_locarna(one_expanded_hit, locarna_alig)

        return loc_rep
    except exceptions.LocarnaException as e:
        one_expanded_hit.annotations['msgs'] = [str(e), e.errors]
        empty_hit = BA_support.Subsequences(one_expanded_hit)
        return empty_hit
    except (exceptions.SubseqMatchError, exceptions.ParsingError) as e:
        one_expanded_hit.annotations['msgs'] = [str(e)]
        empty_hit = BA_support.Subsequences(one_expanded_hit)
        return empty_hit
    except (TypeError, AttributeError, FileNotFoundError) as e:
        one_expanded_hit.annotations['msgs'] = [str(e)]
        empty_hit = BA_support.Subsequences(one_expanded_hit)
        return empty_hit
    finally:
        for f in [locarna_file1, locarna_file2, loc_out_file]:
            if f is not None:
                BA_support.remove_one_file_with_try(f)
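
The worker takes a single packed tuple, which makes it convenient for multiprocessing-style map calls; a sketch (the pool setup and parameter values are assumptions):

from multiprocessing import Pool

# Hypothetical parallel use; expanded_hits and query come from the pipeline,
# and the locarna_params string and anchor_length are illustrative values.
packs = [(hit, str(query.seq), '--struct-local=0', 7) for hit in expanded_hits]
with Pool(processes=4) as pool:
    reports = pool.map(locarna_worker, packs)
# Each report is either a parsed LocARNA report object or an empty
# BA_support.Subsequences fallback with the error text in annotations['msgs'].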
Example #7
def repredict_structures_for_homol_seqs(
    query,
    seqs2predict_fasta,
    threads=None,
    prediction_method=None,
    pred_method_params=None,
    all_hits_list=None,
    seqs2predict_list=None,
    use_cm_file=None,
):
    """Run RNA structure prediction based on chosen method and parameters.
    """

    default_sim_tr_perc = 90
    default_score_tr = 0.0
    query_max_len_diff = 0.1

    try:
        if 'default' == prediction_method:
            # do nothing
            return None, None, []

        elif 'rfam-Rc' == prediction_method:
            if use_cm_file is None:
                msg = "No CM model. Can't use {}.".format(prediction_method)
                ml.warning(msg)
                return None, None, [msg]
            else:
                structures, exec_time = cmmodel_rnafold_c(
                    seqs2predict_fasta,
                    use_cm_file,
                    threads=threads,
                    params=pred_method_params.get(prediction_method, {}))
                return structures, exec_time, []

        elif 'rfam-centroid' == prediction_method:
            # run cmscan if needed
            # run cmfetch
            # run cmemit -> homologous seqs
            # run centroid_homfold

            method_parameters = pred_method_params.get(prediction_method, {})
            if use_cm_file is None:
                msg = "No CM model. Can't use {}.".format(prediction_method)
                ml.warning(msg)
                return None, None, [msg]
            else:
                cep = method_parameters.get('cmemit', '')
                if '-u' not in cep:
                    cep += ' -u'
                if '-N' not in cep:
                    cep += ' -N {}'.format(method_parameters.get('n_seqs', 10))

                hf_file = run_cmemit(use_cm_file, params=cep)

                structures, exec_time = me_centroid_homfold(
                    seqs2predict_fasta, hf_file, params=method_parameters)

                BA_support.remove_one_file_with_try(hf_file)
                return structures, exec_time, []

        elif 'rfam-sub' == prediction_method:
            if use_cm_file is None:
                msg = "No CM model. Can't use {}.".format(prediction_method)
                ml.warning(msg)
                return None, None, [msg]
            else:
                ref_structure = extract_ref_from_cm(use_cm_file)

                structures, exec_time = rfam_subopt_pred(
                    seqs2predict_fasta,
                    ref_structure,
                    params=pred_method_params.get(prediction_method, None),
                    threads=threads,
                )
                return structures, exec_time, []

        elif 'rnafold' == prediction_method:
            structures, exec_time = rnafold_wrap_for_predict(
                seqs2predict_fasta,
                params=pred_method_params.get(prediction_method,
                                              {}).get('RNAfold', ''))
            return structures, exec_time, []

        elif 'fq-sub' == prediction_method:
            a, qf = mkstemp(prefix='rba_', suffix='_55', dir=CONFIG.tmpdir)
            with os.fdopen(a, 'w') as fd:
                fd.write('>query\n{}\n'.format(str(query.seq)))

            structures, exec_time = subopt_fold_query(
                seqs2predict_fasta,
                qf,
                params=pred_method_params.get(prediction_method, None),
                threads=threads)
            BA_support.remove_one_file_with_try(qf)
            return structures, exec_time, []

        elif 'C-A-sub' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, homologous_seqs, msgs = create_nr_trusted_hits_file_MSA_safe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file

            f, homologous_sequence_file = mkstemp(prefix='rba_',
                                                  suffix='_64',
                                                  dir=CONFIG.tmpdir)
            with os.fdopen(f, 'w') as fh:
                SeqIO.write(homologous_seqs, fh, 'fasta')

            structures, exec_time = subopt_fold_alifold(
                seqs2predict_fasta,
                homologous_sequence_file,
                aligner='clustalo',
                params=method_parameters,
                threads=threads)
            BA_support.remove_one_file_with_try(homologous_sequence_file)
            del homologous_sequence_file
            del homologous_seqs
            return structures, exec_time, msgs

        elif 'M-A-sub' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, homologous_seqs, msgs = create_nr_trusted_hits_file_MSA_safe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file

            f, homologous_sequence_file = mkstemp(prefix='rba_',
                                                  suffix='_65',
                                                  dir=CONFIG.tmpdir)
            with os.fdopen(f, 'w') as fh:
                SeqIO.write(homologous_seqs, fh, 'fasta')

            structures, exec_time = subopt_fold_alifold(
                seqs2predict_fasta,
                homologous_sequence_file,
                aligner='muscle',
                params=method_parameters,
                threads=threads,
            )

            BA_support.remove_one_file_with_try(homologous_sequence_file)
            del homologous_sequence_file
            del homologous_seqs
            return structures, exec_time, msgs

        elif 'C-A-r-Rc' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, _, msgs = create_nr_trusted_hits_file_MSA_safe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )

            structures, exec_time = alifold_refold_prediction(
                nr_homo_hits_file,
                seqs2predict_fasta,
                refold='refold_rnafoldc',
                threads=threads,
                params=method_parameters,
                msa_alg='clustalo')

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file
            return structures, exec_time, msgs

        elif 'M-A-r-Rc' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, _, msgs = create_nr_trusted_hits_file_MSA_safe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )
            structures, exec_time = alifold_refold_prediction(
                nr_homo_hits_file,
                seqs2predict_fasta,
                refold='refold_rnafoldc',
                threads=threads,
                params=method_parameters,
                msa_alg='muscle')

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file
            return structures, exec_time, msgs

        elif 'C-A-U-r-Rc' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, _, msgs = create_nr_trusted_hits_file_MSA_safe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )
            structures, exec_time = alifold_refold_prediction(
                nr_homo_hits_file,
                seqs2predict_fasta,
                refold='conserved_ss_rnafoldc',
                threads=threads,
                params=method_parameters,
                msa_alg='clustalo')

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file
            return structures, exec_time, msgs

        elif 'M-A-U-r-Rc' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, _, msgs = create_nr_trusted_hits_file_MSA_safe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )
            structures, exec_time = alifold_refold_prediction(
                nr_homo_hits_file,
                seqs2predict_fasta,
                refold='conserved_ss_rnafoldc',
                threads=threads,
                params=method_parameters,
                msa_alg='muscle')

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file
            return structures, exec_time, msgs

        elif 'centroid' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, _, msgs = create_nr_homolog_hits_file_MSA_unsafe(
                all_hits=all_hits_list,
                query=query,
                sim_threshold_percent=method_parameters.get(
                    'pred_sim_threshold', default_sim_tr_perc),
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )

            raw_structures, exec_time = me_centroid_homfold(
                seqs2predict_fasta,
                nr_homo_hits_file,
                params=method_parameters)

            # check noncanonical base pairs
            allow_nc = method_parameters.get('allow_noncanonical', False)
            allow_lp = method_parameters.get('allow_lonely_pairs', False)
            if not allow_nc:
                for seq in raw_structures:
                    repstr = find_nc_and_remove(
                        str(seq.seq), structure=seq.letter_annotations['ss0'])
                    seq.letter_annotations['ss0'] = repstr

            # check lonely basepairs
            if not allow_lp:
                for seq in raw_structures:
                    repstr = check_lonely_bp(seq.letter_annotations['ss0'])
                    seq.letter_annotations['ss0'] = repstr

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del nr_homo_hits_file
            return raw_structures, exec_time, msgs

        elif 'centroid-fast' == prediction_method:
            method_parameters = pred_method_params.get(prediction_method, {})
            if query.annotations['ambiguous']:
                raise exceptions.AmbiguousQuerySequenceException()

            raw_structures, exec_time = centroid_homfold_fast(
                all_seqs=all_hits_list,
                query=query,
                all_seqs_fasta=seqs2predict_fasta,
                n=method_parameters.get('max_seqs_in_prediction', 10),
                centroid_homfold_params=method_parameters,
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff))

            # check noncanonical base pairs
            allow_nc = method_parameters.get('allow_noncanonical', False)
            allow_lp = method_parameters.get('allow_lonely_pairs', False)
            if not allow_nc:
                for seq in raw_structures:
                    repstr = find_nc_and_remove(
                        str(seq.seq), structure=seq.letter_annotations['ss0'])
                    seq.letter_annotations['ss0'] = repstr

            # check lonely basepairs
            if not allow_lp:
                for seq in raw_structures:
                    repstr = check_lonely_bp(seq.letter_annotations['ss0'])
                    seq.letter_annotations['ss0'] = repstr

            return raw_structures, exec_time, []

        elif 'TurboFold' == prediction_method:
            # Set sim_threshold_percent to 100 because only identical sequences should be
            # removed from the TurboFold prediction; redundant sequences then receive the
            # structure predicted for their representative.
            all_hits_filtered = BA_support.filter_ambiguous_seqs_from_list(
                all_hits_list)
            seqs2predict_filtered = BA_support.filter_ambiguous_seqs_from_list(
                seqs2predict_list)
            if len(seqs2predict_list) != len(seqs2predict_filtered):
                ml.warning(
                    'Some sequences contain ambiguous bases - they will not be predicted.'
                )

            if query.annotations['ambiguous']:
                raise exceptions.AmbiguousQuerySequenceException()

            method_parameters = pred_method_params.get(prediction_method, {})

            nr_homo_hits_file, _, msgs = create_nr_homolog_hits_file_MSA_unsafe(
                all_hits=all_hits_filtered,
                query=query,
                sim_threshold_percent=100,
                cmscore_tr=method_parameters.get('cmscore_tr',
                                                 default_score_tr),
                cm_threshold_percent=method_parameters.get(
                    'cmscore_percent', None),
                len_diff=method_parameters.get('query_max_len_diff',
                                               query_max_len_diff),
            )

            with open(nr_homo_hits_file, 'r') as nrf:
                nr_homo_hits = list(SeqIO.parse(nrf, format='fasta'))

            nh = sha1()
            nh.update(str(sorted(method_parameters.items())).encode())
            nh_str = nh.hexdigest()

            structures_t, exec_time = turbofold_with_homologous(
                all_sequences=seqs2predict_filtered,
                nr_homologous=nr_homo_hits,
                params=method_parameters.get('TurboFold', {}),
                n=method_parameters.get('max_seqs_in_prediction', 3),
                cpu=threads,
                pkey=prediction_method,
                sha1val=nh_str,
            )

            structures = BA_support.rebuild_structures_output_from_pred(
                seqs2predict_list, structures_t)

            BA_support.remove_one_file_with_try(nr_homo_hits_file)
            del structures_t
            del nr_homo_hits
            del nr_homo_hits_file
            return structures, exec_time, msgs

        elif 'Turbo-fast' == prediction_method:
            if query.annotations['ambiguous']:
                raise exceptions.AmbiguousQuerySequenceException()

            nh = sha1()
            nh.update(
                str(
                    sorted(
                        pred_method_params.get(prediction_method,
                                               {}).items())).encode())
            nh_str = nh.hexdigest()

            structures_t, exec_time = turbofold_fast(
                all_seqs=all_hits_list,
                seqs2predict=seqs2predict_list,
                query=query,
                cpu=threads,
                n=pred_method_params.get(prediction_method,
                                         {}).get('max_seqs_in_prediction', 3),
                turbofold_params=pred_method_params.get(prediction_method,
                                                        {}).get(
                                                            'TurboFold', {}),
                len_diff=pred_method_params.get(prediction_method, {}).get(
                    'query_max_len_diff', query_max_len_diff),
                pkey=prediction_method,
                sha1val=nh_str,
            )

            structures = BA_support.rebuild_structures_output_from_pred(
                seqs2predict_list, structures_t)

            del structures_t
            return structures, exec_time, []

    except exceptions.NoHomologousSequenceException:
        msg = nonhomseqwarn(prediction_method)
        return None, None, [msg]
    except exceptions.AmbiguousQuerySequenceException:
        msgfail = "Query sequence contains ambiguous characters. Can't use {}.".format(
            prediction_method)
        ml.warning(msgfail)
        return None, None, [msgfail]
    except exceptions.SubprocessException as e:
        msg = "{} can't be used. Error message follows: {} \n{}".format(
            prediction_method, str(e), e.errors)
        ml.error(msg)
        return None, None, [str(e)]
    except Exception as e:
        ml.error("{} can't be used. Error message follows: \n{}.".format(
            prediction_method, str(e)))
        return None, None, [str(e)]

    assert False, "Should not reach here (bad prediction method name)."
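
A dispatch sketch for the plain RNAfold branch; the method name comes from the branches above, while the parameter shape mirrors how pred_method_params is read (values are assumptions):

# Hypothetical call.
structures, exec_time, msgs = repredict_structures_for_homol_seqs(
    query=query_record,                     # SeqRecord carrying the 'ambiguous' annotation
    seqs2predict_fasta='to_predict.fasta',  # hypothetical path
    threads=4,
    prediction_method='rnafold',
    pred_method_params={'rnafold': {'RNAfold': ''}},  # extra RNAfold CLI flags go here
)
if structures is None:
    print('\n'.join(msgs))  # every failure path reports through msgs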
Example #8
def wrapped_ending_with_prediction(
    args_inner,
    analyzed_hits,
    pred_method=None,
    method_params=None,
    used_cm_file=None,
    multi_query=False,
    iteration=0,
):
    """
    wrapper for prediction of secondary structures
    :param args_inner: Namespace of input arguments
    :param analyzed_hits: BlastSearchRecompute object
    :param pred_method:
    :param method_params:
    :param used_cm_file: cmfile if cmfile is known (user given or computed)
    :return:
    """
    ml.debug(fname())
    exec_time = {}
    msg = 'Entering structure prediction...'
    if ml.level < 21:
        ml.info(msg)
    else:
        print(msg)
        ml.info(msg)

    if pred_method is None:
        pred_method = args_inner.prediction_method

    if isinstance(pred_method, str):
        pred_method = (pred_method, )

    if method_params is None:
        method_params = args_inner.pred_params

    # ======= filter if needed =======
    # filtering is based on e-value or bit score;
    # homologous hits still get used for prediction

    # annotate ambiguous bases
    query = BA_support.annotate_ambiguos_base(analyzed_hits.query)

    # copy the list before filtering
    all_hits_list = [i.extension for i in analyzed_hits.get_all_hits()]

    if args_inner.filter_by_eval is not None:
        hits2predict = filter_by_eval(analyzed_hits.get_all_hits(),
                                      BA_support.blast_hit_getter_from_subseq,
                                      args_inner.filter_by_eval)
        _hits = HitList()
        for h in hits2predict:
            _hits.append(h)
        analyzed_hits.hits = _hits
    elif args_inner.filter_by_bitscore is not None:
        hits2predict = filter_by_bits(analyzed_hits.get_all_hits(),
                                      BA_support.blast_hit_getter_from_subseq,
                                      args_inner.filter_by_bitscore)
        _hits = HitList()
        for h in hits2predict:
            _hits.append(h)
        analyzed_hits.hits = _hits
    else:
        analyzed_hits.hits = analyzed_hits.get_all_hits()

    # if used_cm_file is provided do not override it with CM from RFAM
    # if use_rfam flag was given, then used_cm_file is already the best_matching model
    # if analyzed_hits.best_matching_model is None - then we could not find the best matching model in RFAM
    #  and the rfam based methods should fail (i.e. not predict anything)
    delete_cm = False
    if used_cm_file is None and analyzed_hits.best_matching_model is not None:
        rfam = RfamInfo()
        used_cm_file = run_cmfetch(
            rfam.file_path, analyzed_hits.best_matching_model['target_name'])
        delete_cm = True

    fd, seqs2predict_fasta = mkstemp(prefix='rba_',
                                     suffix='_83',
                                     dir=CONFIG.tmpdir)
    with os.fdopen(fd, 'w') as fah:
        for hit in analyzed_hits.hits:
            if len(hit.extension.seq) == 0:
                continue
            fah.write('>{}\n{}\n'.format(hit.extension.id,
                                         str(hit.extension.seq)))

    if not isinstance(method_params, dict):
        raise Exception('prediction method parameters must be a Python dict')

    # prediction methods present in analyzed_hits
    #  which might be loaded from intermediate file

    # check whether structures for a method are already predicted for all required hits
    # and whether the prediction parameters of that method were the same

    prediction_msgs = []
    # compute prediction methods which were not computed
    for pkey in set(pred_method):
        # add sha1 hashes
        nh = sha1()
        nh.update(str(sorted(method_params.get(pkey, {}).items())).encode())
        current_hash = nh.hexdigest()

        if all(pkey in h.extension.letter_annotations for h in analyzed_hits.hits) and \
                len(
                    {
                        h.extension.annotations.get('sha1', {}).get(pkey, None) for h in analyzed_hits.hits
                    } | {current_hash, }
                ) == 1:
            msg_skip = 'All structures already computed for {}. Skipping...'.format(
                pkey)
            ml.info(msg_skip)
            if ml.level > 20:
                print(msg_skip, flush=True)
            continue

        msg_run = 'Running: {}...'.format(pkey)
        ml.info(msg_run)

        if ml.level > 20:
            print(msg_run, flush=True)

        structures, etime, msgs = repredict_structures_for_homol_seqs(
            query,
            seqs2predict_fasta,
            args_inner.threads,
            prediction_method=pkey,
            pred_method_params=method_params,
            all_hits_list=all_hits_list,
            seqs2predict_list=[i.extension for i in analyzed_hits.hits],
            use_cm_file=used_cm_file,
        )

        exec_time[pkey] = etime

        if structures is None:
            msg = 'Structures not predicted with {} method'.format(pkey)
            ml.info(msg)
            if ml.level > 20:
                print('STATUS: ' + msg)

        else:
            for i, hit in enumerate(analyzed_hits.hits):
                assert str(hit.extension.seq) == str(structures[i].seq)
                hit.extension.annotations['sss'] += [pkey]

                hit.extension.annotations['msgs'] += structures[
                    i].annotations.get('msgs', [])

                # expects "predicted" in annotations - for now, if not given, default is True, as not all prediction
                #  methods implement "predicted" in their output
                if structures[i].annotations.get('predicted', True):
                    hit.extension.letter_annotations[pkey] = structures[
                        i].letter_annotations['ss0']

                if 'sha1' not in hit.extension.annotations:
                    hit.extension.annotations['sha1'] = dict()
                hit.extension.annotations['sha1'][pkey] = current_hash

                try:
                    del hit.extension.letter_annotations['ss0']
                except KeyError:
                    pass
                try:
                    hit.extension.annotations['sss'].remove('ss0')
                except ValueError:
                    pass

            analyzed_hits.update_hit_stuctures()

        # check if msgs are not empty
        if msgs:
            prediction_msgs.append('{}: {}'.format(pkey, '\n'.join(msgs)))

        analyzed_hits.msgs = prediction_msgs

        with open(args_inner.blast_in + '.r-' + args_inner.sha1[:10],
                  'r+') as f:
            all_saved_data = json.load(f)
            all_saved_data[iteration] = blastsearchrecompute2dict(
                analyzed_hits)
            f.seek(0)
            f.truncate()
            json.dump(all_saved_data, f, indent=2)

    # remove structures predicted by different methods (which might be saved from previous computation)
    for hit in analyzed_hits.hits:
        for pkey in set(hit.extension.letter_annotations.keys()):
            if pkey not in pred_method:
                del hit.extension.letter_annotations[pkey]
                try:
                    hit.extension.annotations['sss'].remove(pkey)
                except ValueError:
                    pass

    BA_support.remove_one_file_with_try(seqs2predict_fasta)

    if delete_cm:
        BA_support.remove_one_file_with_try(used_cm_file)

    add_loc_to_description(analyzed_hits)

    # write html if requested
    if args_inner.html:
        html_file = iter2file_name(args_inner.html, multi_query, iteration)
        ml.info('Writing html to {}.'.format(html_file))
        with open(html_file, 'wb') as h:
            h.write(write_html_output(analyzed_hits))

    # write csv file if requested
    if args_inner.csv:
        csv_file = iter2file_name(args_inner.csv, multi_query, iteration)
        ml.info('Writing csv to {}.'.format(csv_file))
        analyzed_hits.to_csv(csv_file)

    # write json if requested
    if args_inner.json:
        json_file = iter2file_name(args_inner.json, multi_query, iteration)
        ml.info('Writing json to {}.'.format(json_file))
        j_obj = json.dumps(blastsearchrecompute2dict(analyzed_hits), indent=2)
        if getattr(args_inner, 'zip_json', False):
            with open(json_file + '.gz', 'wb') as ff:
                ff.write(gzip.compress(j_obj.encode()))
        else:
            with open(json_file, 'w') as ff:
                ff.write(j_obj)

    if args_inner.pandas_dump:
        pickle_file = iter2file_name(args_inner.pandas_dump, multi_query,
                                     iteration)
        ml.info('Writing pandas pickle to {}.'.format(pickle_file))
        pandas.to_pickle(analyzed_hits.pandas, pickle_file)

    if args_inner.dump:
        dump_file = iter2file_name(args_inner.dump, multi_query, iteration)
        ml.info('Writing dump files base: {}.'.format(dump_file))
        with open(dump_file, 'wb') as pp:
            pickle.dump(analyzed_hits, pp, pickle.HIGHEST_PROTOCOL)

        with open(dump_file + '.time_dump', 'wb') as pp:
            pickle.dump(exec_time, pp, pickle.HIGHEST_PROTOCOL)

    return analyzed_hits
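
The skip logic above keys cached structures on a SHA-1 of the sorted parameter items; a self-contained sketch of that fingerprint:

from hashlib import sha1

params = {'max_seqs_in_prediction': 10, 'query_max_len_diff': 0.1}
h = sha1()
h.update(str(sorted(params.items())).encode())
fingerprint = h.hexdigest()
# The same dict always yields the same fingerprint, so a method is re-run
# only when its parameters (and therefore the hash) change between runs.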
Example #9
def lunch_computation(args_inner, shared_list=None):
    ml.debug(fname())
    if not shared_list:
        shared_list = []

    # update params if different config is requested
    CONFIG.override(tools_paths(args_inner.config_file))

    p_blast = BA_support.blast_in(args_inner.blast_in, b=args_inner.b_type)
    query_seqs = list(SeqIO.parse(args_inner.blast_query, 'fasta'))

    if len(p_blast) != len(query_seqs):
        ml.error(
            'Number of query sequences in provided BLAST output file ({}) does not match number of query sequences'
            ' in query FASTA file ({}).'.format(len(p_blast), len(query_seqs)))
        sys.exit(1)

    # check if BLAST does not contain unexpected sequence characters
    validate_args.check_blast(p_blast)

    # create list of correct length if needed
    all_saved_data = [None] * len(query_seqs)
    saved_file = '{}.r-{}'.format(args_inner.blast_in, args_inner.sha1[:10])
    with open(saved_file, 'r+') as f:
        _saved = json.load(f)
        if _saved is None:
            f.seek(0)
            f.truncate()
            json.dump(all_saved_data, f)
        else:
            msg = "Loading backup data."
            print('STATUS: ' + msg)
            ml.info(msg + ' file: ' + saved_file)
            all_saved_data = _saved

            for saved_data in all_saved_data:
                # we can have partially computed data
                if saved_data is None:
                    continue
                if saved_data['args']['sha1'] != args_inner.sha1:
                    msg = "Input argument hash does not match the saved argument hash. "
                    if saved_data['args']['sha1'][:10] == args_inner.sha1[:10]:
                        msg += "This is because of truncating hashes to first 10 characters. "
                        msg += "Please remove the '{}' file.".format(
                            saved_file)
                        ml.error(msg)
                        sys.exit(1)
                    else:
                        msg += "Please remove the '{}' file.".format(
                            saved_file)
                        ml.error(msg)
                        sys.exit(1)

    multi_query = len(p_blast) > 1

    # this is done for each query
    ml_out_line = []
    all_analyzed = []
    for iteration, (bhp, query, saved_data) in enumerate(
            zip(p_blast, query_seqs, all_saved_data)):
        if saved_data is None:
            print('STATUS: processing query: {}'.format(query.id))
            validate_args.verify_query_blast(blast=bhp, query=query)

            analyzed_hits = BlastSearchRecompute(args_inner, query, iteration)
            analyzed_hits.multi_query = multi_query

            # run cm model build
            # this allows failing fast if Rfam was selected and the model is not found
            ih_model, analyzed_hits = find_and_extract_cm_model(
                args_inner, analyzed_hits)

            # select all
            all_blast_hits = BA_support.blast_hsps2list(bhp)

            if len(all_blast_hits) == 0:
                ml.error('No hits found in {} - {}. Nothing to do.'.format(
                    args_inner.blast_in, bhp.query))
                continue

            # check the filters here; the actual filtering of the extended hits
            # happens later, in wrapped_ending_with_prediction
            if args_inner.filter_by_eval is not None:
                tmp = filter_by_eval(all_blast_hits,
                                     BA_support.blast_hit_getter_from_hits,
                                     args_inner.filter_by_eval)
                if len(tmp) == 0 and len(all_blast_hits) != 0:
                    ml.error(
                        'The requested filter removed all BLAST hits {} - {}. Nothing to do.'
                        .format(args_inner.blast_in, bhp.query))
                    continue
            elif args_inner.filter_by_bitscore is not None:
                tmp = filter_by_bits(all_blast_hits,
                                     BA_support.blast_hit_getter_from_hits,
                                     args_inner.filter_by_bitscore)
                if len(tmp) == 0 and len(all_blast_hits) != 0:
                    ml.error(
                        'The requested filter removed all BLAST hits {} - {}. Nothing to do.'
                        .format(args_inner.blast_in, bhp.query))
                    continue

            all_short = all_blast_hits

            # now this is different for each mode
            if args_inner.mode == 'simple':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_simple_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            elif args_inner.mode == 'locarna':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_locarna_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            elif args_inner.mode == 'meta':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_meta_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            else:
                raise ValueError(
                    'Unknown option - should be caught by argparse.')

            if len(analyzed_hits.hits) == 0:
                ml.error(
                    "Extension failed for all sequences. Please see the error message. You can also try '--mode simple'."
                )
                sys.exit(1)

            analyzed_hits.copy_hits()

            with open(args_inner.blast_in + '.r-' + args_inner.sha1[:10],
                      'r+') as f:
                all_saved_data = json.load(f)
                all_saved_data[iteration] = blastsearchrecompute2dict(
                    analyzed_hits)
                f.seek(0)
                f.truncate()
                json.dump(all_saved_data, f, indent=2)

        else:
            print(
                'STATUS: extended sequences loaded from backup file for query {}'
                .format(query.id))
            analyzed_hits = blastsearchrecomputefromdict(saved_data)

            # overwrite the saved args with current
            # this will update used prediction methods and other non essential stuff
            analyzed_hits.args = args_inner

            if analyzed_hits.args.cm_file:
                cm_file_rfam_user = analyzed_hits.args.cm_file
            else:
                cm_file_rfam_user = None

        all_analyzed.append(analyzed_hits)

        # write all hits to fasta
        fda, all_hits_fasta = mkstemp(prefix='rba_',
                                      suffix='_22',
                                      dir=CONFIG.tmpdir)
        os.close(fda)
        analyzed_hits.write_results_fasta(all_hits_fasta)

        out_line = []
        # multiple prediction params
        if args_inner.dev_pred:
            dp_list = []
            # accommodate more dev pred outputs
            dpfile = None

            def _drop_suffix(name, suffix):
                # str.strip(chars) would remove a character set, not a suffix
                return name[:-len(suffix)] if name.endswith(suffix) else name

            if getattr(args_inner, 'dump', False):
                dpfile = _drop_suffix(args_inner.dump, 'dump')
            if getattr(args_inner, 'pandas_dump', False):
                dpfile = _drop_suffix(args_inner.pandas_dump, 'pandas_dump')
            if getattr(args_inner, 'json', False):
                dpfile = _drop_suffix(args_inner.json, 'json')

            # optimization so the rfam cm file is used only once
            if cm_file_rfam_user is None and 'rfam' in ''.join(
                    args_inner.prediction_method):
                best_model = get_cm_model(args_inner.blast_query,
                                          threads=args_inner.threads)
                rfam = RfamInfo()
                cm_file_rfam_user = run_cmfetch(rfam.file_path, best_model)

            for method in args_inner.prediction_method:
                # cycle the prediction method settings
                # get the set of params for each prediction
                selected_pred_params = [
                    kk for kk in args_inner.pred_params if method in kk
                ]
                shuffle(selected_pred_params)
                for i, method_params in enumerate(selected_pred_params):
                    ah = deepcopy(analyzed_hits)

                    random_flag = BA_support.generate_random_name(
                        8, shared_list)
                    shared_list.append(random_flag)

                    pname = re.sub(' ', '', str(method))
                    flag = '|pred_params|' + random_flag

                    # rebuild the args with only the actually used prediction settings
                    ah.args.prediction_method = method
                    ah.args.pred_params = method_params

                    if getattr(args_inner, 'dump', False):
                        spa = args_inner.dump.split('.')
                        ah.args.dump = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'pandas_dump', False):
                        spa = args_inner.pandas_dump.split('.')
                        ah.args.pandas_dump = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'pdf_out', False):
                        spa = args_inner.pdf_out.split('.')
                        ah.args.pdf_out = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'json', False):
                        spa = args_inner.json.split('.')
                        ah.args.json = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]

                    wrapped_ending_with_prediction(
                        args_inner=ah.args,
                        analyzed_hits=ah,
                        pred_method=method,
                        method_params=method_params,
                        used_cm_file=cm_file_rfam_user,
                        multi_query=multi_query,
                        iteration=iteration,
                    )
                    success = True
                    out_line.append(to_tab_delim_line_simple(ah.args))

                    dp_list.append((i, method_params, success, flag, pname,
                                    random_flag, args_inner.pred_params))

            if dpfile is not None:
                with open(dpfile + 'devPredRep', 'wb') as devf:
                    pickle.dump(dp_list, devf)
        else:
            wrapped_ending_with_prediction(
                args_inner=args_inner,
                analyzed_hits=analyzed_hits,
                used_cm_file=cm_file_rfam_user,
                multi_query=multi_query,
                iteration=iteration,
            )
            out_line.append(to_tab_delim_line_simple(args_inner))

        ml_out_line.append('\n'.join(out_line))

        if cm_file_rfam_user is not None and os.path.exists(cm_file_rfam_user):
            BA_support.remove_one_file_with_try(cm_file_rfam_user)

        BA_support.remove_one_file_with_try(all_hits_fasta)
    return '\n'.join(ml_out_line), all_analyzed
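
The resume mechanism relies on a JSON backup named '<blast_in>.r-<sha1[:10]>' that holds one slot per query; a minimal sketch of bootstrapping such a file (the name is illustrative):

import json

saved_file = 'hits.blastout.r-1a2b3c4d5e'  # hypothetical backup file name
with open(saved_file, 'w') as f:
    json.dump([None, None], f)  # one None slot per query; None means not yet computed
# lunch_computation() later replaces each slot with
# blastsearchrecompute2dict(analyzed_hits) as queries finish.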