Example No. 1
    def setUp(self):
        self.args = Pseudoargs(
            blast_query,
            blast_in,
            blast_db,
            b_type='plain',
            prediction_method=[
                'rnafold',
            ],
            blast_regexp=r'(?<=\|)[A-Z0-9]*\.?\d*$',
            enable_overwrite=True,
            mode='simple',
            html=test_output_file,
        )

        ff, csv = tempfile.mkstemp(prefix='rba_', suffix='_t1')
        os.close(ff)
        self.csv = csv

        ff, json_file = tempfile.mkstemp(prefix='rba_', suffix='_t4')
        os.close(ff)
        self.json = json_file

        ff, fasta = tempfile.mkstemp(prefix='rba_', suffix='_t5')
        os.close(ff)
        self.fasta = fasta

        ff, fasta_structures = tempfile.mkstemp(prefix='rba_', suffix='_t6')
        os.close(ff)
        self.fasta_structures = fasta_structures

        tp = tools_paths(
            os.path.join(os.path.dirname(os.path.dirname(__file__)),
                         'rna_blast_analyze', 'BR_core', 'config.txt'))

        CONFIG.override(tp)

        rfam = RfamInfo()
        self.sha1 = compute_args_hash(
            self.args, os.path.join(CONFIG.rfam_dir, rfam.gzname))
        self.test_backup_file = blast_in + '.r-' + self.sha1[:10]
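
Since setUp creates several temporary files and a backup file name, a matching cleanup keeps test runs from littering the temp directory. A minimal tearDown sketch, assuming the unittest.TestCase layout above (the backup file is created lazily, hence the existence checks):

    def tearDown(self):
        # remove the temporary files created in setUp
        for fn in (self.csv, self.json, self.fasta, self.fasta_structures):
            if os.path.exists(fn):
                os.remove(fn)
        # the backup file may not exist if the test never ran the pipeline
        if os.path.exists(self.test_backup_file):
            os.remove(self.test_backup_file)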
Example No. 2
def main():
    try:
        # outer envelope for the script
        # ========= perform argument parsing =========
        if download_name in sys.argv and not ('-q' in sys.argv
                                              or '--blast_query' in sys.argv):
            # run the Rfam download here
            # do not run it if a normal analysis run was requested as well
            from rna_blast_analyze.BR_core.config import tools_paths, CONFIG
            from rna_blast_analyze.BR_core import cmalign
            if cfg_name in sys.argv:
                CONFIG.override(
                    tools_paths(config_file=sys.argv[sys.argv.index(cfg_name) +
                                                     1]))
            cmalign.download_cmmodels_file()
            # rfam database downloaded
            sys.exit(0)

        args = f_parser()

        _ = lunch_with_args(args)

        # if we reach here, exit with 0
        sys.exit(0)
    except Exception as e:
        print('Something went wrong.')
        try:
            import traceback
            print(
                'The error traceback is written to rboAnalyzer.log. '
                'Please send it along with the query file and BLAST input to the developers.'
            )

            with open('rboAnalyzer.log', 'w') as fd:
                fd.write(str(e))
                fd.write(traceback.format_exc())
        except Exception:
            pass
        sys.exit(1)
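
The except branch illustrates a defensive logging pattern: the traceback is persisted to a file, but a failure while writing the log must not mask the original error, so the program still exits non-zero. A standalone sketch of the same pattern (run_safely and the log file name are illustrative, not part of rboAnalyzer):

import sys
import traceback

def run_safely(fn, logfile='error.log'):
    # run fn(); on any failure, try to persist the traceback, then exit 1
    try:
        fn()
        sys.exit(0)
    except Exception as e:
        try:
            with open(logfile, 'w') as fd:
                fd.write(str(e))
                fd.write(traceback.format_exc())
        except Exception:
            # never let a logging error hide the original failure
            pass
        sys.exit(1)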
Example No. 3
def extend_meta_core(analyzed_hits,
                     query,
                     args_inner,
                     all_short,
                     multi_query,
                     iteration,
                     ih_model,
                     timeout=None):
    ml.debug(fname())
    # update params if different config is requested
    CONFIG.override(tools_paths(args_inner.config_file))

    blast_args = deepcopy(args_inner)
    locarna_args = deepcopy(args_inner)
    b_all_short = deepcopy(all_short)
    l_all_short = deepcopy(all_short)

    if args_inner.repredict_file is None:
        fd, repred_file = mkstemp(prefix='rba_',
                                  suffix='_18',
                                  dir=CONFIG.tmpdir)
        os.close(fd)
    else:
        repred_file = args_inner.repredict_file

    for i, args in enumerate([blast_args, locarna_args]):
        args.prediction_method = []
        args.pred_params = dict()
        args.dump = None
        args.pdf_out = None
        args.pandas_dump = None
        args.repredict_file = repred_file + str(i)
        args.dev_pred = False
        args.logfile = None
        args.json = None
        args.html = None
        args.cm_file = ih_model

    analyzed_hits_simple = deepcopy(analyzed_hits)
    analyzed_hits_locarna = deepcopy(analyzed_hits)

    analyzed_hits_simple, _, _, _ = extend_simple_core(analyzed_hits_simple,
                                                       query, blast_args,
                                                       b_all_short,
                                                       multi_query, iteration,
                                                       ih_model)
    analyzed_hits_locarna, _, _, _ = extend_locarna_core(analyzed_hits_locarna,
                                                         query,
                                                         locarna_args,
                                                         l_all_short,
                                                         multi_query,
                                                         iteration,
                                                         ih_model,
                                                         timeout=timeout)

    # add cmstat to query
    analyzed_hits.query = analyzed_hits_simple.query

    order_out = []

    b_dict = {BA_support.get_hit_n(h): h for h in analyzed_hits_simple.hits}
    l_dict = {BA_support.get_hit_n(h): h for h in analyzed_hits_locarna.hits}
    ok_keys = sorted(set(b_dict.keys()) | set(l_dict.keys()))
    for inum in ok_keys:
        bh = b_dict.get(inum, None)
        lh = l_dict.get(inum, None)

        hits = [bh, lh]
        # fall back to the simple extension if locarna returned an empty hit
        # also handle the case when both methods returned empty hits
        filtered_hits = [h for h in hits if h is not None]
        if len(filtered_hits) == 1:
            msg = 'Only one extension method completed successfully for {}. ' \
                  'Using the successfully extended sequence in the output.'.format(filtered_hits[0].extension.id)
            ml.info(msg)
            if ml.getEffectiveLevel() < 20:
                print(msg)
            analyzed_hits.hits.append(filtered_hits[0])
            continue
        elif len(filtered_hits) == 0:
            # both methods failed for this hit; record it as failed
            analyzed_hits.hits_failed.append(lh)
            continue

        # pick the extension with the higher cmstat bit score
        bit_scores = [
            h.extension.annotations['cmstat']['bit_sc'] for h in hits
        ]
        bit_index = bit_scores.index(max(bit_scores))
        order_out.append(bit_index)

        analyzed_hits.hits.append(hits[bit_index])

    # build failed hits
    b_dict_failed = {
        BA_support.get_hit_n(h): h
        for h in analyzed_hits_simple.hits_failed
    }
    l_dict_failed = {
        BA_support.get_hit_n(h): h
        for h in analyzed_hits_locarna.hits_failed
    }
    for inum in sorted(set(b_dict_failed) | set(l_dict_failed)):
        if inum not in ok_keys:
            if inum in b_dict_failed:
                analyzed_hits.hits_failed.append(b_dict_failed[inum])
            elif inum in l_dict_failed:
                analyzed_hits.hits_failed.append(l_dict_failed[inum])
            else:
                raise KeyError(
                    "Failed to find inum key in failed extensions. This should not happen."
                )

    # build the repredict file here if needed
    if args_inner.repredict_file:
        b_repredict = BA_support.iter2file_name(blast_args.repredict_file,
                                                multi_query, iteration)
        l_repredict = BA_support.iter2file_name(locarna_args.repredict_file,
                                                multi_query, iteration)
        o_repredict = BA_support.iter2file_name(args_inner.repredict_file,
                                                multi_query, iteration)
        with open(b_repredict, 'r') as barf, \
                open(l_repredict, 'r') as larf, \
                open(o_repredict, 'w') as reprf:
            # note: the files must be merged in the same order as the methods
            # in the preceding for-loop, i.e. the order in which order_out
            # was populated
            bb = (barf, larf)

            # copy the two header lines from the first file
            reprf.write(bb[0].readline())
            reprf.write(bb[0].readline())
            # discard the first line of the other files
            for other in bb[1:]:
                other.readline()

            # for each hit, write the line from the selected method's file
            for o in order_out:
                lines = [f.readline() for f in bb]
                reprf.write(lines[o])

    # recreate needed data from selected hits
    homology_prediction = []
    homol_seqs = []
    for hit in analyzed_hits.hits:
        homology_prediction.append(hit.hpred)
        if hit.hpred:
            homol_seqs.append(hit.extension)

        # add a default (unstructured) prediction if none is present
        if 'ss0' not in hit.extension.letter_annotations:
            if 'sss' not in hit.extension.annotations:
                hit.extension.annotations['sss'] = []
            hit.extension.annotations['sss'] += ['ss0']
            hit.extension.letter_annotations['ss0'] = '.' * len(
                hit.extension.seq)

    # remove description from hits and sources
    for hit in analyzed_hits.hits:
        hit.extension.description = ''

    if args_inner.cm_file or args_inner.use_rfam:
        cm_file_rfam_user = ih_model
    else:
        cm_file_rfam_user = None
        BA_support.remove_one_file_with_try(ih_model)
    return analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user
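
The heart of the meta mode is the per-hit choice between the two extension methods by cmstat bit score. A stripped-down sketch of that decision, assuming each candidate carries annotations['cmstat']['bit_sc'] on its extension as in the code above (pick_better_extension is illustrative, not part of the library):

def pick_better_extension(simple_hit, locarna_hit):
    # return the candidate with the higher cmstat bit score;
    # either argument may be None when that method failed
    candidates = [h for h in (simple_hit, locarna_hit) if h is not None]
    if not candidates:
        return None
    return max(
        candidates,
        key=lambda h: h.extension.annotations['cmstat']['bit_sc'],
    )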
Example No. 4
def lunch_with_args(args):
    # ========= imports ==========
    # move slow imports here so that argcomplete stays fast
    import logging
    from rna_blast_analyze.BR_core import BA_verify
    from rna_blast_analyze.BR_core import cmalign
    from rna_blast_analyze.BR_core.config import tools_paths, CONFIG
    from rna_blast_analyze.BR_core.validate_args import validate_args, compute_args_hash
    from rna_blast_analyze.BR_core.luncher import lunch_computation
    from rna_blast_analyze.BR_core.convert_classes import blastsearchrecompute2dict
    from rna_blast_analyze.BR_core.cmalign import RfamInfo

    logger = logging.getLogger('rboAnalyzer')

    ch = logging.StreamHandler()
    formatter = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
    ch.setFormatter(formatter)

    logger.addHandler(ch)

    # set logger level
    logger.setLevel(max(3 - args.verbose, 1) * 10)

    logger.debug('parsed arguments: {}'.format(args))

    # create logging file if requested
    if args.logfile:
        fh = logging.FileHandler(args.logfile)
        formatter = logging.Formatter(
            '%(asctime)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        logger.addHandler(fh)

    start_msg = 'STATUS: starting rboAnalyzer...'
    print(start_msg)
    logger.info(start_msg)

    logger.info('BLAST file: {}'.format(args.blast_in))
    logger.info('Query file: {}'.format(args.blast_query))
    logger.info('BLAST db:   {}'.format(args.blast_db))
    if args.config_file:
        logger.info('configfile: {}'.format(args.config_file))

    # ========= load optional cfg file =========
    CONFIG.override(tools_paths(config_file=args.config_file))

    # ========= check rfam =========
    if not args.download_rfam and not cmalign.check_rfam_present():
        msgfail = 'RFAM models file is not present in specified path. ' \
                  'Please enable rfam download (--download_rfam) or provide prepared directory.'
        logger.error(msgfail)
        sys.exit(1)

    if args.download_rfam:
        cmalign.download_cmmodels_file()

    # ========= check if tools needed for requested methods are installed =========
    BA_verify.check_necessery_tools(methods=args.prediction_method +
                                    [args.mode])

    # ========= check if parameters make sense =========
    if not validate_args(args):
        print(
            "There was an error with provided arguments. Please see the error message."
        )
        sys.exit(1)

    # ========= compute args hash =========
    rfam = RfamInfo()
    hashstring = compute_args_hash(args,
                                   os.path.join(rfam.rfam_dir, rfam.gzname))
    setattr(args, 'sha1', hashstring)

    # ========= run =========
    # look for a backup file from a previous run; create an empty one if missing
    blast_fn = os.path.basename(args.blast_in) + '.r-' + hashstring[:10]
    blast_dir = os.path.dirname(args.blast_in)
    if blast_dir == '':
        blast_dir = os.getcwd()
    if blast_fn not in os.listdir(blast_dir):
        with open(args.blast_in + '.r-' + args.sha1[:10], 'w') as f:
            json.dump(None, f)

    _, results = lunch_computation(args)
    with open(os.path.join(blast_dir, blast_fn), 'w') as f:
        json.dump([blastsearchrecompute2dict(r) for r in results], f, indent=2)

    return results
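
The backup file name joins the BLAST input name with the first 10 characters of the argument hash, so a rerun with identical arguments can find its previous results. A small sketch of the naming scheme (the real hash comes from compute_args_hash; the SHA-1 below only stands in for it):

import hashlib
import os

def backup_file_name(blast_in, args_hash):
    # mirrors the naming above: '<blast input basename>.r-<first 10 hash chars>'
    return os.path.basename(blast_in) + '.r-' + args_hash[:10]

print(backup_file_name('my_search.xml',
                       hashlib.sha1(b'example arguments').hexdigest()))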
Example No. 5
    def test_config_override(self):
        rfam_dir = '/test/test/Documents/rfamdb/'
        self.assertNotEqual(CONFIG.rfam_dir, rfam_dir)
        CONFIG.override(
            tools_paths(os.path.join(fwd, 'test_data', 'config_test.txt')))
        self.assertEqual(CONFIG.rfam_dir, rfam_dir)
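
Because CONFIG is module-level state, the override in this test leaks into any test that runs afterwards. A sketch of isolating it, assuming CONFIG.rfam_dir is a plain writable attribute (the setUp/tearDown pair follows unittest conventions):

    def setUp(self):
        # remember the global state this test is about to change
        self._original_rfam_dir = CONFIG.rfam_dir

    def tearDown(self):
        # restore it so later tests see the default paths again
        CONFIG.rfam_dir = self._original_rfam_dir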
Example No. 6
def lunch_computation(args_inner, shared_list=None):
    ml.debug(fname())
    if not shared_list:
        shared_list = []

    # update params if different config is requested
    CONFIG.override(tools_paths(args_inner.config_file))

    p_blast = BA_support.blast_in(args_inner.blast_in, b=args_inner.b_type)
    query_seqs = list(SeqIO.parse(args_inner.blast_query, 'fasta'))

    if len(p_blast) != len(query_seqs):
        ml.error(
            'Number of query sequences in provided BLAST output file ({}) does not match number of query sequences'
            ' in query FASTA file ({}).'.format(len(p_blast), len(query_seqs)))
        sys.exit(1)

    # check if BLAST does not contain unexpected sequence characters
    validate_args.check_blast(p_blast)

    # create list of correct length if needed
    all_saved_data = [None] * len(query_seqs)
    saved_file = '{}.r-{}'.format(args_inner.blast_in, args_inner.sha1[:10])
    with open(saved_file, 'r+') as f:
        _saved = json.load(f)
        if _saved is None:
            f.seek(0)
            f.truncate()
            json.dump(all_saved_data, f)
        else:
            msg = "Loading backup data."
            print('STATUS: ' + msg)
            ml.info(msg + ' file: ' + saved_file)
            all_saved_data = _saved

            for saved_data in all_saved_data:
                # the backup may contain only partially computed data
                if saved_data is None:
                    continue
                if saved_data['args']['sha1'] != args_inner.sha1:
                    msg = "Input argument hash does not match the saved argument hash. "
                    if saved_data['args']['sha1'][:10] == args_inner.sha1[:10]:
                        msg += "This is caused by truncating hashes to the first 10 characters. "
                    msg += "Please remove the '{}' file.".format(saved_file)
                    ml.error(msg)
                    sys.exit(1)

    multi_query = len(p_blast) > 1

    # this is done for each query
    ml_out_line = []
    all_analyzed = []
    for iteration, (bhp, query, saved_data) in enumerate(
            zip(p_blast, query_seqs, all_saved_data)):
        if saved_data is None:
            print('STATUS: processing query: {}'.format(query.id))
            validate_args.verify_query_blast(blast=bhp, query=query)

            analyzed_hits = BlastSearchRecompute(args_inner, query, iteration)
            analyzed_hits.multi_query = multi_query

            # build the CM model
            # this fails fast if Rfam was selected and the model is not found
            ih_model, analyzed_hits = find_and_extract_cm_model(
                args_inner, analyzed_hits)

            # collect all BLAST hits
            all_blast_hits = BA_support.blast_hsps2list(bhp)

            if len(all_blast_hits) == 0:
                ml.error('No hits found in {} - {}. Nothing to do.'.format(
                    args_inner.blast_in, bhp.query))
                continue

            # filter if needed
            if args_inner.filter_by_eval is not None:
                tmp = filter_by_eval(all_blast_hits,
                                     BA_support.blast_hit_getter_from_hits,
                                     args_inner.filter_by_eval)
                if len(tmp) == 0 and len(all_blast_hits) != 0:
                    ml.error(
                        'The requested filter removed all BLAST hits {} - {}. Nothing to do.'
                        .format(args_inner.blast_in, bhp.query))
                    continue
            elif args_inner.filter_by_bitscore is not None:
                tmp = filter_by_bits(all_blast_hits,
                                     BA_support.blast_hit_getter_from_hits,
                                     args_inner.filter_by_bitscore)
                if len(tmp) == 0 and len(all_blast_hits) != 0:
                    ml.error(
                        'The requested filter removed all BLAST hits {} - {}. Nothing to do.'
                        .format(args_inner.blast_in, bhp.query))
                    continue

            all_short = all_blast_hits

            # now this is different for each mode
            if args_inner.mode == 'simple':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_simple_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            elif args_inner.mode == 'locarna':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_locarna_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            elif args_inner.mode == 'meta':
                analyzed_hits, homology_prediction, homol_seqs, cm_file_rfam_user = extend_meta_core(
                    analyzed_hits, query, args_inner, all_short, multi_query,
                    iteration, ih_model)
            else:
                raise ValueError(
                    'Unknown option - should be caught by argparse.')

            if len(analyzed_hits.hits) == 0:
                ml.error(
                    "Extension failed for all sequences. Please see the error message. You can also try '--mode simple'."
                )
                sys.exit(1)

            analyzed_hits.copy_hits()

            with open(args_inner.blast_in + '.r-' + args_inner.sha1[:10],
                      'r+') as f:
                all_saved_data = json.load(f)
                all_saved_data[iteration] = blastsearchrecompute2dict(
                    analyzed_hits)
                f.seek(0)
                f.truncate()
                json.dump(all_saved_data, f, indent=2)

        else:
            print(
                'STATUS: extended sequences loaded from backup file for query {}'
                .format(query.id))
            analyzed_hits = blastsearchrecomputefromdict(saved_data)

            # overwrite the saved args with the current ones
            # this updates the prediction methods used and other non-essential settings
            analyzed_hits.args = args_inner

            if analyzed_hits.args.cm_file:
                cm_file_rfam_user = analyzed_hits.args.cm_file
            else:
                cm_file_rfam_user = None

        all_analyzed.append(analyzed_hits)

        # write all hits to fasta
        fda, all_hits_fasta = mkstemp(prefix='rba_',
                                      suffix='_22',
                                      dir=CONFIG.tmpdir)
        os.close(fda)
        analyzed_hits.write_results_fasta(all_hits_fasta)

        out_line = []
        # multiple prediction params
        if args_inner.dev_pred:
            dp_list = []
            # accommodate more dev pred outputs
            # (note: str.strip removes a set of characters, not a literal suffix)
            dpfile = None
            if getattr(args_inner, 'dump', False):
                dpfile = args_inner.dump.strip('dump')
            if getattr(args_inner, 'pandas_dump', False):
                dpfile = args_inner.pandas_dump.strip('pandas_dump')
            if getattr(args_inner, 'json', False):
                dpfile = args_inner.json.strip('json')

            # optimization: fetch the Rfam CM file only once
            if cm_file_rfam_user is None and 'rfam' in ''.join(
                    args_inner.prediction_method):
                best_model = get_cm_model(args_inner.blast_query,
                                          threads=args_inner.threads)
                rfam = RfamInfo()
                cm_file_rfam_user = run_cmfetch(rfam.file_path, best_model)

            for method in args_inner.prediction_method:
                # cycle through the prediction method settings
                # get the set of params for each prediction
                selected_pred_params = [
                    kk for kk in args_inner.pred_params if method in kk
                ]
                shuffle(selected_pred_params)
                for i, method_params in enumerate(selected_pred_params):
                    ah = deepcopy(analyzed_hits)

                    random_flag = BA_support.generate_random_name(
                        8, shared_list)
                    shared_list.append(random_flag)

                    pname = re.sub(' ', '', str(method))
                    flag = '|pred_params|' + random_flag

                    # rebuild the args with only the actually used prediction settings
                    ah.args.prediction_method = method
                    ah.args.pred_params = method_params

                    if getattr(args_inner, 'dump', False):
                        spa = args_inner.dump.split('.')
                        ah.args.dump = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'pandas_dump', False):
                        spa = args_inner.pandas_dump.split('.')
                        ah.args.pandas_dump = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'pdf_out', False):
                        spa = args_inner.pdf_out.split('.')
                        ah.args.pdf_out = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]
                    if getattr(args_inner, 'json', False):
                        spa = args_inner.json.split('.')
                        ah.args.json = '.'.join(
                            spa[:-1]) + flag + '.' + spa[-1]

                    wrapped_ending_with_prediction(
                        args_inner=ah.args,
                        analyzed_hits=ah,
                        pred_method=method,
                        method_params=method_params,
                        used_cm_file=cm_file_rfam_user,
                        multi_query=multi_query,
                        iteration=iteration,
                    )
                    success = True
                    out_line.append(to_tab_delim_line_simple(ah.args))

                    dp_list.append((i, method_params, success, flag, pname,
                                    random_flag, args_inner.pred_params))

            if dpfile is not None:
                with open(dpfile + 'devPredRep', 'wb') as devf:
                    pickle.dump(dp_list, devf)
        else:
            wrapped_ending_with_prediction(
                args_inner=args_inner,
                analyzed_hits=analyzed_hits,
                used_cm_file=cm_file_rfam_user,
                multi_query=multi_query,
                iteration=iteration,
            )
            out_line.append(to_tab_delim_line_simple(args_inner))

        ml_out_line.append('\n'.join(out_line))

        if cm_file_rfam_user is not None and os.path.exists(cm_file_rfam_user):
            BA_support.remove_one_file_with_try(cm_file_rfam_user)

        BA_support.remove_one_file_with_try(all_hits_fasta)
    return '\n'.join(ml_out_line), all_analyzed
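
The resume logic above relies on a simple JSON protocol: the backup file holds either null (fresh run) or a list with one slot per query, where None marks a query not yet computed. A minimal sketch of that initialization step, assuming the file already exists (as lunch_with_args guarantees):

import json

def load_or_init_backup(path, n_queries):
    # read the backup; on a fresh run (null), write back a list of
    # n_queries empty slots so per-query results can be filled in later
    with open(path, 'r+') as f:
        saved = json.load(f)
        if saved is None:
            saved = [None] * n_queries
            f.seek(0)
            f.truncate()
            json.dump(saved, f)
    return saved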