Exemplo n.º 1
0
def compute_alifold(msa_file, alifold_params=''):
    """
    Run RNAalifold with the alignment file supplied on stdin.

    :param msa_file: path to the alignment file; its content is piped to RNAalifold
    :param alifold_params: additional command line options, split shell-style
    :return: path to a temporary file holding the stdout of RNAalifold
    :raises exceptions.RNAalifoldException: when RNAalifold returns a non-zero
        exit code; the captured stderr is attached to the exception
    """
    ml.info('Running RNAalifold.')
    ml.debug(fname())
    out_fd, out_path = mkstemp(prefix='rba_', suffix='_02', dir=CONFIG.tmpdir)

    with TemporaryFile(mode='w+', encoding='utf-8') as err_log, \
            os.fdopen(out_fd, 'w') as out_handle, \
            open(msa_file, 'r') as msa_handle:
        executable = '{}RNAalifold'.format(CONFIG.viennarna_path)
        cmd = [executable, '--noPS', '-f', 'C']
        cmd.extend(shlex.split(alifold_params))
        ml.debug(cmd)

        process = subprocess.Popen(
            cmd,
            stdin=subprocess.PIPE,
            stdout=out_handle,
            stderr=err_log,
            universal_newlines=True,
        )
        process.communicate(input=msa_handle.read())

        if not process.returncode:
            return out_path

        # non-zero exit code: report the captured stderr to the caller
        msgfail = 'RNAalifold failed.'
        ml.error(msgfail)
        err_log.seek(0)
        raise exceptions.RNAalifoldException(msgfail, err_log.read())
Exemplo n.º 2
0
def build_stockholm_from_clustal_alig(clustal_file, alif_file):
    """
    Build a stockholm alignment file from a clustal alignment and alifold output.

    :param clustal_file: path to the alignment in clustal format
    :param alif_file: path to the file read with parse_seq_str; the 'ss0'
        annotation of its first record becomes the 'SS_cons' column annotation
    :return: path to the temporary stockholm file
    """
    ml.debug(fname())
    with open(clustal_file, 'r') as clustal_handle, open(alif_file, 'r') as alifold_handle:
        # dump the alignment as stockholm into memory and re-read it with the project parser
        buffer = StringIO()
        AlignIO.write(AlignIO.read(clustal_handle, format='clustal'),
                      buffer,
                      format='stockholm')
        buffer.seek(0)
        stockholm = stockholm_read(buffer)

        # only the first record of the alifold output is used
        first_record = next(iter(parse_seq_str(alifold_handle)), None)
        if first_record is not None:
            stockholm.column_annotations['SS_cons'] = first_record.letter_annotations['ss0']

        out_fd, out_path = mkstemp(prefix='rba_', suffix='_15', dir=CONFIG.tmpdir)
        with os.fdopen(out_fd, 'w') as out_handle:
            stockholm.write_stockholm(out_handle)

        return out_path
Exemplo n.º 3
0
def select_sequences_from_similarity_rec(dist_mat: np.ndarray,
                                         sim_threshold_percent=90) -> list:
    """
    Pick indices of sequences so that too-similar partners are left out.

    The transposed matrix is walked row by row. NaN entries are ignored.
    A row that is not yet excluded is always kept; every position whose value
    reaches the threshold (and which is not already kept) becomes excluded.

    :param dist_mat: distmat table, by default obtained from read_clustal_distmat_file, values in percent
    :param sim_threshold_percent: threshold for similarity in percent
    :return: sorted list of kept indices; ``[0]`` when dist_mat is None
    """
    ml.debug(fname())
    if dist_mat is None:
        return [0]

    rows = dist_mat.transpose()
    positions = np.array(range(len(rows)))
    kept = set()
    dropped = set()

    for idx, row in enumerate(rows):
        valid = ~np.isnan(row)
        values = row[valid]
        valid_positions = positions[valid]
        similar = values >= sim_threshold_percent

        already_dropped = idx in dropped
        if not already_dropped and not any(similar):
            # nothing similar to this one and nobody excluded it before
            kept.add(idx)
            continue

        if not already_dropped:
            kept.add(idx)
        partners = valid_positions[np.where(similar)]
        dropped |= set(partners.tolist()) - kept

    return sorted(kept)
Exemplo n.º 4
0
def run_cmbuild(cmbuild_input_file, cmbuild_params=''):
    """
    Call cmbuild on an alignment file and return the path of the produced model.

    note: consider what to do if only one sequence is available

    :param cmbuild_input_file: Stockholm or selex alignment file
        (MSA with secondary structure prediction)
    :param cmbuild_params: additional params to cmbuild, split on whitespace
    :return: path to the temporary file passed to cmbuild as its output
    :raises exceptions.CmbuildException: when cmbuild returns a non-zero exit
        code; combined stdout/stderr is attached
    """
    ml.info('Runing cmbuild.')
    ml.debug(fname())
    model_fd, model_path = mkstemp(prefix='rba_', suffix='_13', dir=CONFIG.tmpdir)
    # only the path is needed, cmbuild writes the file itself
    os.close(model_fd)

    with TemporaryFile(mode='w+', encoding='utf-8') as log:
        cmd = ['{}cmbuild'.format(CONFIG.infernal_path), '-F']
        cmd.extend(cmbuild_params.split())
        cmd.extend([model_path, cmbuild_input_file])
        ml.debug(cmd)

        exit_code = call(cmd, stdout=log, stderr=log)
        if exit_code:
            msgfail = 'Call to cmbuild failed.'
            ml.error(msgfail)
            log.seek(0)
            raise exceptions.CmbuildException(msgfail, log.read())

        return model_path
Exemplo n.º 5
0
def run_cmalign_on_fasta(fasta_file, model_file, cmalign_params='--notrunc', alig_format='stockholm'):
    """
    Align sequences from a fasta file to a CM model with cmalign.

    :param fasta_file: input fasta to be aligned to cm model
    :param model_file: file containing one or more cm models
    :param cmalign_params: extra cmalign options, split on whitespace
    :param alig_format: value handed to the ``--outformat`` option
    :return: path to the temporary file given to cmalign via ``-o``
    :raises exceptions.CmalignException: when cmalign returns a non-zero exit
        code; combined stdout/stderr is attached
    """
    ml.info('Runing cmaling.')
    ml.debug(fname())
    alig_fd, alig_path = mkstemp(prefix='rba_', suffix='_14', dir=CONFIG.tmpdir)
    # cmalign writes the file on its own, the descriptor is not needed
    os.close(alig_fd)

    with TemporaryFile(mode='w+', encoding='utf-8') as log:
        cmd = ['{}cmalign'.format(CONFIG.infernal_path)]
        cmd.extend(['--informat', 'fasta'])
        cmd.extend(['--outformat', alig_format])
        cmd.extend(cmalign_params.split())
        cmd.extend(['-o', alig_path, model_file, fasta_file])

        ml.debug(cmd)
        exit_code = call(cmd, stdout=log, stderr=log)

        if exit_code:
            msgfail = 'Call to cmalign failed.'
            ml.error(msgfail)
            log.seek(0)
            raise exceptions.CmalignException(msgfail, log.read())

        return alig_path
Exemplo n.º 6
0
def rebuild_structures_output_from_pred(reference_sequences_list,
                                        predicted_structures_list,
                                        method=None):
    """
    Build one output record per reference sequence, merged with its prediction.

    Predictions are matched to reference sequences by ``id``. Entries of
    ``predicted_structures_list`` without an ``id`` attribute are ignored.
    When several predictions share an id, the first one is used.

    :param reference_sequences_list: iterable of records providing seq, id,
        name, description, annotations and letter_annotations
    :param predicted_structures_list: iterable of prediction results
    :param method: optional name of the prediction method; when given, a
        warning is logged for every sequence without a prediction
    :return: list of SeqRecord objects with ``annotations['predicted']`` set to
        True or False
    """
    ml.debug(fname())
    # Key the predictions by id directly. Looking up a position in a filtered
    # id list and using it on the unfiltered input would pick the wrong record
    # as soon as one entry has no ``id``.
    predictions_by_id = {}
    for prediction in predicted_structures_list:
        if hasattr(prediction, 'id'):
            # setdefault keeps the first occurrence of a duplicated id
            predictions_by_id.setdefault(prediction.id, prediction)

    structures_list = []
    for seq in reference_sequences_list:
        # NOTE(review): seq.annotations is handed over as-is; if SeqRecord keeps
        # the same dict object, the updates below also show up on the
        # reference record - confirm whether that is intended.
        nr = SeqRecord(seq.seq,
                       id=seq.id,
                       name=seq.name,
                       description=seq.description,
                       annotations=seq.annotations,
                       letter_annotations=seq.letter_annotations)
        if seq.id in predictions_by_id:
            prediction = predictions_by_id[seq.id]
            nr.letter_annotations.update(prediction.letter_annotations)
            nr.annotations.update(prediction.annotations)
            nr.annotations['predicted'] = True
        else:
            if method:
                wmsg = '{} failed to predict structure for seq {}.'.format(
                    method, nr.id)
                ml.warning(wmsg)
            nr.annotations['predicted'] = False

        structures_list.append(nr)

    return structures_list
Exemplo n.º 7
0
def build_cm_model_rsearch(query_seq, path2selected_sim_array):
    """
    Build a CM model from a single query sequence using cmbuild ``--rsearch``.

    The query structure is predicted with RNAfold and written, together with
    the annotation-free query, into a temporary stockholm file which is handed
    to cmbuild. The temporary stockholm file is removed afterwards, also when
    cmbuild fails.

    :param query_seq: record with a ``seq`` attribute; it is deep-copied, the
        original is not modified
    :param path2selected_sim_array: value passed to the ``--rsearch`` option
    :return: path to the cm model file returned by run_cmbuild
    """
    ml.debug(fname())
    query_structure = rna_blast_analyze.BR_core.viennaRNA.RNAfold(
        str(query_seq.seq))[0]

    # remove any annotations from query:
    qs_clean = deepcopy(query_seq)
    qs_clean.annotations = dict()
    qs_clean.letter_annotations = dict()

    # build stockholm like file for use in cm model build
    st_like = StockholmAlig()
    st_like.append(qs_clean)
    st_like.column_annotations['SS_cons'] = query_structure

    fds, stock_file = mkstemp(prefix='rba_', suffix='_30', dir=CONFIG.tmpdir)
    with os.fdopen(fds, 'w') as f:
        st_like.write_stockholm(f)

    try:
        # run actual cmbuild
        cm_model_file = run_cmbuild(
            stock_file,
            cmbuild_params='--rsearch {}'.format(path2selected_sim_array))
    finally:
        # cleanup must also happen when cmbuild raises
        BA_support.remove_one_file_with_try(stock_file)
    return cm_model_file
Exemplo n.º 8
0
def RNAfold(sequence):
    """
    Fold a single sequence with RNAfold.

    :param sequence: sequence string, sent to RNAfold on stdin
    :return: tuple ``(structure, energy)`` parsed from the second output line
    :raises exceptions.RNAfoldException: when RNAfold exits with a non-zero
        code; the captured stderr is attached to the exception
    """
    # local import: the module-level imports of this file are not visible here
    from subprocess import CalledProcessError

    ml.debug(fname())
    with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
        try:
            r = check_output([
                '{}RNAfold'.format(CONFIG.viennarna_path),
                '--noPS',
            ],
                             input=sequence.encode(),
                             stderr=tmp)
        except CalledProcessError as e:
            # subprocess.check_output signals failure by raising, not by
            # returning an exception object
            r = e

        if isinstance(r, Exception):
            msgfail = 'RNAfold failed.'
            ml.error(msgfail)
            tmp.seek(0)
            raise exceptions.RNAfoldException(msgfail, tmp.read()) from r

        # first line is the sequence, second line is "<structure> (<energy>)"
        out_str = r.decode()
        spl = out_str.split('\n')
        seq = spl[0]
        structure = spl[1][:len(seq)]
        # skip the separating space and the opening bracket, drop the closing one
        energy = float(spl[1][len(seq) + 2:-1])
        return structure, energy
Exemplo n.º 9
0
def run_cmemit(model, params='', out_file=None):
    """
    Run cmemit on a CM model.

    :param model: path to the cm model file
    :param params: additional cmemit options, split on whitespace
    :param out_file: optional output path; a temporary file is created when
        it is not given
    :return: path to the file given to cmemit via ``-o``
    :raises exceptions.CmemitException: when cmemit returns a non-zero exit
        code; combined stdout/stderr is attached
    """
    ml.info('Run cmemit.')
    ml.debug(fname())
    if out_file:
        out = out_file
    else:
        fd, out = mkstemp(prefix='rba_', suffix='_12', dir=CONFIG.tmpdir)
        os.close(fd)

    with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
        # build commandline
        cmd = ['{}cmemit'.format(CONFIG.infernal_path)]
        if params != '':
            cmd += params.split()
        cmd += ['-o', out, model]

        ml.debug(cmd)
        r = call(cmd, stdout=tmp, stderr=tmp)

        if r:
            msgfail = 'Call to cmemit failed.'
            ml.error(msgfail)
            tmp.seek(0)
            # read() without a size returns the whole log; read(0) returned ''
            raise exceptions.CmemitException(msgfail, tmp.read())
    return out
Exemplo n.º 10
0
def get_cm_model_table(query_file,
                       params=None,
                       threads=None,
                       rfam=None,
                       timeout=None):
    """
    Run cmscan for the query file and return the parsed result table.

    :param query_file: path to the query file handed to run_cmscan
    :param params: optional dict; a non-empty ``'cmscan'`` entry is appended
        to the default ``-g`` option
    :param threads: forwarded to run_cmscan
    :param rfam: forwarded to run_cmscan
    :param timeout: forwarded to run_cmscan
    :return: result of parse_cmalign_infernal_table, or None when a
        CmscanException is raised
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    cmscan_params = '-g '
    if params and ('cmscan' in params) and params['cmscan']:
        cmscan_params += params['cmscan']
    try:
        out_table = run_cmscan(query_file,
                               params=cmscan_params,
                               threads=threads,
                               rfam=rfam,
                               timeout=timeout)
        try:
            # context manager closes the handle even if the parser raises
            with open(out_table, 'r') as f:
                cmscan_data = parse_cmalign_infernal_table(f)
        finally:
            # the temporary table is not needed once parsing is over
            remove_one_file_with_try(out_table)
        return cmscan_data
    except exceptions.CmscanException:
        return None
Exemplo n.º 11
0
def _aligner_block(nr_homolog_hits_file, params, msa_alg, threads=None):
    """
    Align the given fasta file with the chosen MSA program.

    :param nr_homolog_hits_file: fasta file with sequences to align
    :param params: dict (or None) with optional ``'clustalo'`` / ``'muscle'``
        entries holding extra command line options for the aligner
    :param msa_alg: ``'clustalo'`` or ``'muscle'``
    :param threads: int, passed to clustalo as ``--threads``
    :return: alignment file in clustal format
    :raises AttributeError: when msa_alg is not a known aligner
    """
    ml.debug(fname())
    if msa_alg == 'clustalo':
        clustal_params = '--outfmt=clustal --force'
        # params may be None (the muscle branch tolerates it as well)
        user_params = params.get('clustalo', '') if params else ''
        if user_params:
            # separate user options from '--force', otherwise they fuse into one token
            clustal_params += ' ' + user_params

        if threads:
            clustal_params += ' --threads={}'.format(threads)
        alig_file = compute_clustalo_clasic(nr_homolog_hits_file,
                                            clustalo_params=clustal_params)

    elif msa_alg == 'muscle':
        if params and ('muscle' in params) and params['muscle']:
            alig_file = run_muscle(nr_homolog_hits_file,
                                   muscle_params=params['muscle'],
                                   reorder=True)
        else:
            alig_file = run_muscle(nr_homolog_hits_file, reorder=True)

    else:
        msg = 'invalig MSA alg chosen {}, valid are "clustalo" and "muscle"'.format(msa_alg)
        print(msg)
        # carry the message in the exception too, so it is not lost in logs
        raise AttributeError(msg)

    return alig_file
Exemplo n.º 12
0
def check_rfam_present():
    """
    Tell whether the Rfam file exists and is usable.

    When the file exists but check_if_cmpress_processed() reports it as not
    processed, run_cmpress is attempted on it.

    :return: bool - False when the file is missing or cmpress fails, True otherwise
    """
    ml.debug(fname())
    rfam = RfamInfo()

    if not os.path.isfile(rfam.file_path):
        return False

    if check_if_cmpress_processed():
        return True

    try:
        run_cmpress(rfam.file_path)
    except exceptions.CmpressException as e:
        ml.error(str(e))
        ml.error(
            'The Rfam file might be corrupt. Please check following output to get more information.\n'
        )
        print(e.errors)
        return False
    return True
Exemplo n.º 13
0
def rfam_subopt_pred(all_sequence_fasta, cm_ref_str, params=None, threads=1):
    """
    Run hybrid_ss_min on all sequences and post-process each result with _helper_subopt.

    :param all_sequence_fasta: fasta file handed to run_hybrid_ss_min
    :param cm_ref_str: reference structure, second argument of _helper_subopt
    :param params: optional dict; a non-empty ``'mfold'`` entry must be a
        tuple or list of 3 items and is forwarded to run_hybrid_ss_min
    :param threads: 1 runs the post-processing serially, anything else uses a
        multiprocessing pool with that many processes
    :return: list of results of _helper_subopt
    """
    ml.debug(fname())
    if params is None:
        params = dict()

    hybrid_kwargs = {'threads': threads}
    if params and ('mfold' in params) and params['mfold']:
        assert isinstance(params['mfold'], (tuple, list)) and 3 == len(params['mfold']), \
            "Incorrect parameters for hybrid_ss_min given. Need tuple of 3 numbers."
        hybrid_kwargs['mfold'] = params['mfold']
    subs = run_hybrid_ss_min(all_sequence_fasta, **hybrid_kwargs)

    # now compute rna distance score
    if threads != 1:
        with multiprocessing.Pool(processes=threads) as pool:
            jobs = [(seq, cm_ref_str) for seq in subs]
            return pool.starmap(_helper_subopt, jobs)

    return [_helper_subopt(seq, cm_ref_str) for seq in subs]
Exemplo n.º 14
0
def compute_refold(alig_file, cons_file, timeout=None):
    """
    Execute refold.pl and capture its stdout in a temporary file.

    :param alig_file: MSA alignment file in clustal format
    :param cons_file: file with consensus structure in alifold format
    :param timeout: seconds to wait for refold.pl; None waits indefinitely
    :return: path to the file with refold.pl output
    :raises subprocess.TimeoutExpired: when the timeout elapses (the process
        is killed first)
    :raises exceptions.RefoldException: when refold.pl returns a non-zero
        exit code; captured stderr is attached
    """
    ml.debug(fname())
    out_fd, out_path = mkstemp(prefix='rba_', suffix='_03', dir=CONFIG.tmpdir)
    cmd = ['{}refold.pl'.format(CONFIG.refold_path), alig_file, cons_file]
    ml.debug(cmd)

    with TemporaryFile(mode='w+', encoding='utf-8') as err_log, \
            os.fdopen(out_fd, 'w') as out_handle:
        with subprocess.Popen(cmd, stdout=out_handle, stderr=err_log) as proc:
            try:
                proc.wait(timeout=timeout)
            except subprocess.TimeoutExpired:
                # do not leave the child running behind us
                proc.kill()
                proc.wait()
                raise
            exit_code = proc.returncode

        if exit_code:
            msgfail = 'Call to refold.pl failed.'
            ml.error(msgfail)
            err_log.seek(0)
            raise exceptions.RefoldException(msgfail, err_log.read())

        return out_path
Exemplo n.º 15
0
def create_nr_homolog_hits_file_MSA_unsafe(sim_threshold_percent=None,
                                           all_hits=None,
                                           query=None,
                                           cmscore_tr=0.0,
                                           cm_threshold_percent=None,
                                           len_diff=0.1):
    """
    Write a fasta file with non redundant homologous hits.

    Hits are pre-selected by _trusted_hits_selection_wrapper; the returned
    distance table is then used to drop too-similar sequences. When the table
    is empty, only the query is written.

    :param sim_threshold_percent: similarity threshold for the redundancy filter
    :param all_hits: forwarded to _trusted_hits_selection_wrapper
    :param query: query record; the only record written when the table is empty
    :param cmscore_tr: forwarded to _trusted_hits_selection_wrapper
    :param cm_threshold_percent: forwarded to _trusted_hits_selection_wrapper
    :param len_diff: forwarded to _trusted_hits_selection_wrapper as len_diff_
    :return: tuple (path to fasta file, homologous sequences, messages)
    """
    ml.debug(fname())
    dist_table, homologous_seqs, msgs = _trusted_hits_selection_wrapper(
        all_hits, query, cmscore_tr, cm_threshold_percent, len_diff_=len_diff)

    if dist_table.size != 0:
        # normal execution
        selector = rna_blast_analyze.BR_core.predict_structures.select_sequences_from_similarity_rec
        chosen = selector(dist_table, sim_threshold_percent=sim_threshold_percent)
        nr_homolog_hits = [homologous_seqs[idx] for idx in chosen]
    else:
        nr_homolog_hits = [query]

    out_fd, out_path = mkstemp(prefix='rba_', suffix='_59', dir=CONFIG.tmpdir)
    with os.fdopen(out_fd, 'w') as handle:
        SeqIO.write(nr_homolog_hits, handle, 'fasta')

    return out_path, homologous_seqs, msgs
Exemplo n.º 16
0
def extract_ref_from_cm(cm_file):
    """
    Extract the reference structure of a CM model.

    cmemit is run with ``-a -N 1`` and the ``SS_cons`` annotation of the
    emitted alignment is taken, with ``.`` and ``~`` columns removed, and
    recoded with cm_strucutre2br. The temporary cmemit output is removed even
    when reading it fails.

    :param cm_file: path to the cm model file
    :return: structure returned by cm_strucutre2br
    :raises AssertionError: when the emitted alignment does not hold exactly
        one record
    """
    ml.debug(fname())
    single_alig_file = run_cmemit(cm_file, params='-a -N 1')
    try:
        # context manager closes the handle even if the parser raises
        with open(single_alig_file, 'r') as o:
            salig = stockholm_read(o)
    finally:
        remove_one_file_with_try(single_alig_file)

    if len(salig) != 1:
        raise AssertionError('File from cmemit does not have only one record in (not including reference).')

    # recode structure, return it
    ss = salig.column_annotations['SS_cons']
    inserts = str(salig.column_annotations['RF'])

    gapchars = '.~'

    # RF is zipped along only to keep the iteration bounded by the shorter of
    # the two annotations; the filter itself looks at SS_cons alone
    structure_list = [i for i, _ in zip(ss, inserts) if i not in gapchars]

    # recode structure list
    structure = cm_strucutre2br(''.join(structure_list))
    return structure
Exemplo n.º 17
0
def refold_stockholm(stockholm_alig, consensus_structure):
    """
    Refold aligned sequences against a consensus structure with refold.pl.

    The alignment is written as clustal, the consensus structure is wrapped
    in a fake alifold output file and both are handed to compute_refold.

    :param stockholm_alig: alignment object providing write_clustal
    :param consensus_structure: consensus structure string
    :return: list of records parsed from the refold output
    """
    ml.debug(fname())

    # alignment in clustal format
    clustal_fd, clust_tempfile = mkstemp(prefix='rba_', suffix='_23', dir=CONFIG.tmpdir)
    with os.fdopen(clustal_fd, 'w') as clustal_handle:
        stockholm_alig.write_clustal(clustal_handle)

    # fake alifold output: placeholder sequence line followed by the structure
    alifold_fd, alif_fake_file = mkstemp(prefix='rba_', suffix='_24', dir=CONFIG.tmpdir)
    with os.fdopen(alifold_fd, 'w') as alifold_handle:
        # the consensus sequence in alifold file is really not used for anything
        placeholder = 'A'*len(consensus_structure)
        alifold_handle.write(placeholder + '\n')
        alifold_handle.write(consensus_structure + '\n')

    refold_constrained_file = compute_refold(clust_tempfile, alif_fake_file)

    with open(refold_constrained_file, 'r') as refold_handle:
        parsed_seqs = list(BA_support.parse_seq_str(refold_handle))

    # cleanup
    BA_support.remove_files_with_try([clust_tempfile, alif_fake_file, refold_constrained_file])

    return parsed_seqs
Exemplo n.º 18
0
def run_cmfetch(cmfile, modelid, outfile=None):
    """
    Fetch one model from a cm file with cmfetch.

    :param cmfile: path to the file with cm models
    :param modelid: identifier of the model to fetch
    :param outfile: optional output path; a temporary file is created when
        it is not given
    :return: path to the file given to cmfetch via ``-o``
    :raises exceptions.CmfetchException: when cmfetch returns a non-zero exit
        code; combined stdout/stderr is attached
    """
    ml.info('Runing cmfetch.')
    ml.debug(fname())

    out = outfile
    if not outfile:
        out_fd, out = mkstemp(prefix='rba_', suffix='_11', dir=CONFIG.tmpdir)
        os.close(out_fd)

    with TemporaryFile(mode='w+', encoding='utf-8') as log:
        executable = '{}cmfetch'.format(CONFIG.infernal_path)
        cmd = [executable, '-o', out, cmfile, modelid]
        ml.debug(cmd)

        exit_code = call(cmd, stdout=log, stderr=log)
        if not exit_code:
            return out

        msgfail = 'Call to cmfetch failed.'
        ml.error(msgfail)
        log.seek(0)
        raise exceptions.CmfetchException(msgfail, log.read())
Exemplo n.º 19
0
def run_rnaplot(seq, structure=None, format='svg', outfile=None, timeout=None):
    """
    Run RNAplot in the desired format.

    RNAplot is started with the system temp directory as working directory and
    the plot is expected there as ``<random name>_ss.<format>``. The previous
    working directory is restored in every case, including timeout and
    failure.

    :param seq: record with ``seq`` and ``letter_annotations['ss0']`` when
        ``structure`` is None, otherwise the sequence itself
    :param structure: structure string; taken from ``seq`` when None
    :param format: one of 'ps', 'svg', 'gml'
    :param outfile: optional destination; the plot is moved there when given
    :param timeout: seconds to wait for RNAplot; None waits indefinitely
    :return: path to the plot file (``outfile`` when given)
    :raises TypeError: when format is not allowed
    :raises subprocess.TimeoutExpired: when the timeout elapses
    :raises exceptions.RNAplotException: when RNAplot returns a non-zero exit code
    """
    ml.debug(fname())
    if structure is None:
        sequence = str(seq.seq)
        structure = seq.letter_annotations['ss0']
    else:
        sequence = seq

    assert len(sequence) == len(structure)

    allowed_formats = {'ps', 'svg', 'gml'}
    if format not in allowed_formats:
        raise TypeError('Format can be only from {}.'.format(allowed_formats))

    cmd = ['{}RNAplot'.format(CONFIG.viennarna_path), '--output-format={}'.format(format)]
    ml.debug(cmd)

    rnaname = generate_random_name(10)

    currdirr = os.getcwd()
    tmpdir = gettempdir()
    os.chdir(tmpdir)
    try:
        with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
            with subprocess.Popen(cmd, universal_newlines=True, stdin=subprocess.PIPE, stdout=tmp, stderr=tmp) as p:
                try:
                    p.communicate(input='>{}\n{}\n{}\n'.format(
                        rnaname,
                        sequence,
                        structure
                    ), timeout=timeout)
                except subprocess.TimeoutExpired:
                    p.kill()
                    p.wait()
                    raise

                if p.returncode:
                    msgfail = 'Call to RNAplot failed.'
                    ml.error(msgfail)
                    tmp.seek(0)
                    details = tmp.read()
                    ml.debug(details)
                    raise exceptions.RNAplotException(msgfail, details)
    finally:
        # the cwd is process-wide state; never leave it changed behind us
        os.chdir(currdirr)

    plot_output_file = os.path.join(tmpdir, rnaname + '_ss.' + format)
    if outfile is None:
        return plot_output_file

    shutil.move(plot_output_file, outfile)
    return outfile
Exemplo n.º 20
0
def subopt_fold_alifold(all_fasta_hits_file,
                        homologs_file,
                        aligner='muscle',
                        params=None,
                        threads=None):
    """
    Predict structures from suboptimal folds guided by an alifold consensus.

    The homologs are aligned (clustalo or muscle), a consensus structure is
    computed with RNAalifold and every sequence folded by hybrid_ss_min is
    post-processed with _helper_subopt against that consensus.

    :param all_fasta_hits_file: fasta file handed to run_hybrid_ss_min
    :param homologs_file: fasta file with homologs to align
    :param aligner: ``'clustalo'`` or ``'muscle'``
    :param params: optional dict with ``'clustalo'``, ``'muscle'``,
        ``'alifold'`` and ``'mfold'`` entries
    :param threads: 1 runs the post-processing serially, anything else uses a
        multiprocessing pool with that many processes
    :return: list of results of _helper_subopt
    :raises KeyError: when the aligner is not recognized
    """
    ml.debug(fname())
    if params is None:
        params = dict()
    # run aligner
    # =================================================================================================================
    if 'clustalo' == aligner:
        clustal_params = ' --outfmt=clustal --force'
        user_params = params.get('clustalo', '')
        if user_params:
            # separate user options from '--force', otherwise they fuse into one token
            clustal_params += ' ' + user_params

        if threads:
            clustal_params += ' --threads={}'.format(threads)
        alig_file = compute_clustalo_clasic(homologs_file,
                                            clustalo_params=clustal_params)

    elif 'muscle' == aligner:
        alig_file = run_muscle(homologs_file,
                               muscle_params=params.get('muscle', ''),
                               reorder=False)

    else:
        raise KeyError(
            'provided key ({}) not recognized - avalible: "clustalo" "muscle"'.
            format(aligner))

    # run consensus prediction
    # =================================================================================================================
    alif_file = compute_alifold(alig_file,
                                alifold_params=params.get('alifold', ''))

    # possibly need to decode alifold structure
    alif_str = read_seq_str(alif_file)[0]
    consensus_structure = alif_str.letter_annotations['ss0']

    subs = run_hybrid_ss_min(all_fasta_hits_file,
                             mfold=params.get('mfold', (10, 2, 20)),
                             threads=threads)

    # now compute rna distance score
    if threads == 1:
        new_structures = [
            _helper_subopt(seq, consensus_structure) for seq in subs
        ]
    else:
        with multiprocessing.Pool(processes=threads) as pool:
            tuples = [(seq, consensus_structure) for seq in subs]
            new_structures = pool.starmap(_helper_subopt, tuples)

    remove_files_with_try([alif_file, alig_file])
    return new_structures
Exemplo n.º 21
0
def infer_hits_cm(bit_sc, tr=0):
    """
    Flag every bit score that is strictly above the threshold.

    :param bit_sc: iterable of bit scores
    :param tr: threshold
    :return: list of bool, one per score
    """
    ml.debug(fname())
    return [bool(score > tr) for score in bit_sc]
Exemplo n.º 22
0
def run_rnaplot(seq, structure=None, format='svg', outfile=None):
    """
    Run RNAplot in the desired format.

    The input is written to a temporary file which is redirected to RNAplot's
    stdin. RNAplot is run with the system temp directory as working directory
    and the plot is expected there as ``<input file name>_ss.<format>``. The
    previous working directory is restored and the temporary input file is
    removed in every case, including failure.

    :param seq: record with ``seq`` and ``letter_annotations['ss0']`` when
        ``structure`` is None, otherwise the sequence itself
    :param structure: structure string; taken from ``seq`` when None
    :param format: one of 'ps', 'svg', 'gml', 'xrna'
    :param outfile: optional destination; the plot is moved there when given
    :return: path to the plot file (``outfile`` when given)
    :raises TypeError: when format is not allowed
    :raises exceptions.RNAplotException: when RNAplot returns a non-zero exit code
    """
    ml.debug(fname())
    if structure is None:
        sequence = str(seq.seq)
        structure = seq.letter_annotations['ss0']
    else:
        sequence = seq

    assert len(sequence) == len(structure)

    allowed_formats = {'ps', 'svg', 'gml', 'xrna'}
    if format not in allowed_formats:
        raise TypeError('Format can be only from {}.'.format(allowed_formats))

    fd, tmpfile = mkstemp(prefix='rba_', suffix='_08', dir=CONFIG.tmpdir)

    # the sequence name (and so the plot name) is the bare temp file name
    rnaname = tmpfile.split('/')[-1].split('\\')[-1]

    currdirr = os.getcwd()
    tmpdir = gettempdir()

    try:
        with os.fdopen(fd, 'w') as fh:
            fh.write('>{}\n{}\n{}\n'.format(rnaname, sequence, structure))

        cmd = '{} --output-format={} < {}'.format(
            shlex.quote('{}RNAplot'.format(CONFIG.viennarna_path)),
            shlex.quote(format), shlex.quote(tmpfile))
        ml.debug(cmd)

        os.chdir(tmpdir)
        try:
            with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
                r = call(cmd, shell=True, stdout=tmp, stderr=tmp)
                if r:
                    msgfail = 'Call to RNAplot failed.'
                    ml.error(msgfail)
                    tmp.seek(0)
                    details = tmp.read()
                    ml.debug(details)
                    raise exceptions.RNAplotException(msgfail, details)
        finally:
            # the cwd is process-wide state; never leave it changed behind us
            os.chdir(currdirr)
    finally:
        # the input file is not needed any more, whether RNAplot succeeded or not
        remove_one_file_with_try(tmpfile)

    # output file is name of the sequence (in this case name of the file) + "_ss." + chosen format
    plot_output_file = os.path.join(tmpdir, rnaname + '_ss.' + format)
    if outfile is None:
        return plot_output_file

    shutil.move(plot_output_file, outfile)
    return outfile
Exemplo n.º 23
0
def download_cmmodels_file(path=None, url=None):
    """
    Download the cm models file from the Rfam database with wget.

    default retrieve url is: 'ftp://ftp.ebi.ac.uk/pub/databases/Rfam/CURRENT/Rfam.cm.gz'

    When wget reports new data, the archive is decompressed and run_cmpress is
    called on the result. The process exits with status 1 when wget or
    cmpress fails.

    :param path: target directory; defaults to ``RfamInfo().rfam_dir``
    :param url: source url; defaults to ``RfamInfo().url``
    :return: path to the (decompressed) Rfam file
    """
    print('Running CM download from RFAM.')
    ml.debug(fname())
    rfam = RfamInfo()
    if path is None:
        path = rfam.rfam_dir
    if url is None:
        url = rfam.url

    if not os.path.exists(path):
        os.makedirs(path)

    cmd = ['wget', '-N', '-P', path, url]
    ml.debug(cmd)

    ml.info('Downloading RFAM database (aprox 300Mb). This may take a while...')
    with TemporaryFile(mode='w+', encoding='utf-8') as log:
        exit_code = call(cmd, stderr=log, stdout=log)

        if exit_code:
            msgfail = 'Call to wget failed. Please check the internet connection and/or availability of "wget".'
            ml.error(msgfail)
            ml.debug(cmd)
            sys.exit(1)

        log.seek(0)
        wget_output = log.read()

        if 'Remote file no newer than local file' not in wget_output:
            # unzip using build in gzip
            archive_path = os.path.join(path, rfam.gzname)
            with gzip.open(archive_path, 'rb') as packed:
                with open(os.path.join(path, rfam.rfam_file_name), 'wb') as unpacked:
                    shutil.copyfileobj(packed, unpacked)

            # run cmpress to create binary files needed to run cmscan
            try:
                run_cmpress(os.path.join(path, rfam.rfam_file_name))
            except exceptions.CmpressException as e:
                ml.error(str(e))
                ml.error('The Rfam file might be corrupt. Please check following output to get more information.\n')
                print(e.errors)
                sys.exit(1)
        else:
            # do not download
            msg = 'No new data. Nothing to do.'
            ml.info(msg)
            if ml.getEffectiveLevel() > 20:
                print(msg)

        return os.path.join(path, rfam.rfam_file_name)
Exemplo n.º 24
0
def run_muscle(fasta_file, out_file=None, muscle_params='', reorder=True):
    """
    Align sequences with muscle and write the result in clustal format.

    beware, muscle does not keep sequence order and the --stable switch is broken

    :param fasta_file: input fasta file
    :param out_file: optional output path; a temporary file is created when
        it is not given
    :param muscle_params: extra muscle options as one shell-style string
    :param reorder: when True, the alignment is rewritten with records in the
        order of the input fasta file
    :return: path to the alignment file
    :raises exceptions.MuscleException: when muscle returns a non-zero exit
        code; combined stdout/stderr is attached
    """
    ml.info('Running muscle.')
    ml.debug(fname())
    if out_file:
        cl_file = out_file
    else:
        cl_fd, cl_file = mkstemp(prefix='rba_',
                                 suffix='_07',
                                 dir=CONFIG.tmpdir)
        os.close(cl_fd)

    cmd = [
        '{}muscle'.format(CONFIG.muscle_path), '-clwstrict', '-seqtype', 'rna',
        '-out', cl_file, '-in', fasta_file, '-quiet'
    ]
    if muscle_params:
        # No shell is involved, so every option has to be its own argv
        # element; joining them into one string made muscle see a single
        # bogus argument for anything like "-maxiters 2".
        cmd += shlex.split(muscle_params)
    ml.debug(cmd)

    with TemporaryFile(mode='w+', encoding='utf-8') as tmp:
        r = call(cmd, stdout=tmp, stderr=tmp)
        if r:
            msgfail = 'Call to muscle failed.'
            ml.error(msgfail)

            tmp.seek(0)
            raise exceptions.MuscleException(msgfail, tmp.read())

        if reorder:
            # reorder sequences acording to input file
            with open(fasta_file, 'r') as ff, open(cl_file, 'r+') as oo:
                orig_seqs = [i.id for i in SeqIO.parse(ff, format='fasta')]
                muscle_align = {
                    i.id: i
                    for i in AlignIO.read(oo, format='clustal')
                }

                # muscle cuts names, hence the lookup by the first 32 characters
                reo_alig = [muscle_align[s_name[:32]] for s_name in orig_seqs]
                alig = AlignIO.MultipleSeqAlignment(reo_alig)

                # overwrite the file in place and drop any leftover tail
                oo.seek(0)
                AlignIO.write(alig, oo, format='clustal')
                oo.truncate()

        return cl_file
Exemplo n.º 25
0
def run_turbofold(sequences, params, timeout=None):
    """
    Run _turbofold_worker and hand back its result.

    A TurboFoldException or AssertionError raised by the worker is not
    propagated; the exception object itself is returned instead.

    :param sequences: forwarded to _turbofold_worker
    :param params: forwarded to _turbofold_worker
    :param timeout: forwarded to _turbofold_worker
    :return: worker result, or the caught exception instance
    """
    ml.info('Running Turbofold.')
    ml.debug(fname())
    try:
        outcome = _turbofold_worker(sequences, params, timeout=timeout)
    except (exceptions.TurboFoldException, AssertionError) as err:
        outcome = err
    return outcome
Exemplo n.º 26
0
def filter_by_bits(blast_hitlist, getter, filter_conditions):
    """
    Keep only hits whose bit score satisfies every given condition.

    :param blast_hitlist: iterable of hits
    :param getter: callable returning, for a hit, an object with ``bits``
    :param filter_conditions: iterable of (relation, condition) pairs;
        relation is a key of OPERATIONS
    :return: the input itself when there are no conditions, otherwise a list
        of the hits passing all of them
    """
    ml.debug(fname())

    remaining = blast_hitlist
    for relation, condition in filter_conditions:
        passed = []
        for hit in remaining:
            if OPERATIONS[relation](getter(hit).bits, condition):
                passed.append(hit)
        remaining = passed
    return remaining
Exemplo n.º 27
0
def create_report_object_from_locarna(exp_hit, locarna_alig):
    """
    create object which will be appended to BlastSearchRecompute class
    This needs to be Subsequences object

    The Locarna alignment is trimmed to the extent of the record with id
    "query", the aligned part of the hit is selected from it, annotated and
    given a structure obtained with refold_stockholm. Finally the aligned part
    is located in the parent hit sequence.

    :param exp_hit: hit record providing id, name, seq and annotations
    :param locarna_alig: Locarna alignment with a record with id "query", an
        'SS_cons' column annotation and annotations
    :return: BA_support.Subsequences object with extension, best_start and
        best_end set
    :raises exceptions.SubseqMatchError: when the number of records with id
        "query" is not exactly one, or when the aligned part is not found in
        the hit sequence
    """
    ml.debug(fname())
    # chop alignment by seq
    query_ind = [i for i, j in enumerate(locarna_alig) if j.id == 'query']
    # NOTE(review): this also fires when no record has id "query", although
    # the message only speaks about multiple hits
    if len(query_ind) != 1:
        raise exceptions.SubseqMatchError('Got multiple hits with id "query" in the Locarna alignment.')
    trimmed_locarna_alig = trim_alignment_by_sequence(
        locarna_alig,
        str(locarna_alig[query_ind[0]].seq),
        structure_annotation='SS_cons'
    )

    aligned_subsequence = BA_support.select_analyzed_aligned_hit(trimmed_locarna_alig, exp_hit.id)

    # add annotations from exp hit
    # NOTE(review): this is a plain assignment, not a copy - the update below
    # presumably modifies exp_hit.annotations as well; confirm it is intended
    aligned_subsequence.annotations = exp_hit.annotations
    aligned_subsequence.name = exp_hit.name

    # also add annotations from locarna, mainly score
    aligned_subsequence.annotations.update(locarna_alig.annotations)

    # get the structure
    # by refold
    refold_structures = refold_stockholm(trimmed_locarna_alig, trimmed_locarna_alig.column_annotations['SS_cons'])

    # select refold structure for my seq
    seq_refold_structure = _select_refold_structure(refold_structures, exp_hit.id)

    aligned_subsequence.letter_annotations['ss0'] = seq_refold_structure.letter_annotations['ss0']
    aligned_subsequence.annotations['sss'] = ['ss0']

    # prepare seq_record for subsequences
    aligned_subsequence.description = ''
    hit = BA_support.Subsequences(exp_hit)

    hit.extension = aligned_subsequence

    # find the matching sequence
    # (the aligned sequence is used verbatim as a regex pattern, case-insensitively)
    pos_match = re.search(str(aligned_subsequence.seq), str(exp_hit.seq), flags=re.IGNORECASE)
    if not pos_match:
        raise exceptions.SubseqMatchError(
            'Aligned portion of subject sequence in Locarna alignment was not found in parent sequence.'
        )

    hit.best_start, hit.best_end = compute_true_location_locarna(hit, pos_match)

    return hit
Exemplo n.º 28
0
def get_cm_model(query_file, params=None, threads=None):
    """
    Find the name of the best matching cm model for the query.

    :param query_file: forwarded to get_cm_model_table
    :param params: forwarded to get_cm_model_table
    :param threads: forwarded to get_cm_model_table
    :return: ``'target_name'`` of the best matching row, or None when
        select_best_matching_model_from_cmscan finds no row
    """
    ml.debug(fname())
    scan_table = get_cm_model_table(query_file, params, threads)
    winner = select_best_matching_model_from_cmscan(scan_table)
    if winner is not None:
        model_name = winner['target_name']
        ml.info('Best matching model: {}'.format(model_name))
        return model_name
    return None
Exemplo n.º 29
0
def centroid_homfold_fast(all_seqs, query, all_seqs_fasta, n, centroid_homfold_params, len_diff):
    """
    Predict structures with centroid_homfold using a quickly selected set of homologs.

    :param all_seqs: candidate sequences for the homolog selection
    :param query: query record
    :param all_seqs_fasta: fasta file with sequences to predict
    :param n: maximal number of homologous sequences to use
    :param centroid_homfold_params: forwarded to me_centroid_homfold as params
    :param len_diff: forwarded to centroid_homfold_fast_prep
    :return: first item of the me_centroid_homfold result
    """
    ml.debug(fname())

    homologs = centroid_homfold_fast_prep(all_seqs, query, n, len_diff)

    # dump selected homologs to a temporary fasta file for centroid_homfold
    homologs_fd, homologs_path = mkstemp(prefix='rba_', suffix='_74', dir=CONFIG.tmpdir)
    with os.fdopen(homologs_fd, 'w') as handle:
        SeqIO.write(homologs, handle, 'fasta')

    prediction = me_centroid_homfold(all_seqs_fasta, homologs_path, params=centroid_homfold_params)
    structures, _ = prediction
    BA_support.remove_one_file_with_try(homologs_path)
    return structures
Exemplo n.º 30
0
def centroid_homfold_fast_prep(all_seqs, query, n, len_diff):
    """
    Select at most n sequences for the centroid-fast prediction.

    :param all_seqs: candidate sequences
    :param query: query record; ``annotations['ambiguous']`` must be falsy
    :param n: maximal number of sequences to return, at least 1
    :param len_diff: forwarded to BA_support.sel_seq_simple
    :return: first ``int(n)`` items returned by BA_support.sel_seq_simple
    :raises AmbiguousQuerySequenceException: when the query is flagged as ambiguous
    """
    ml.debug(fname())

    assert n >= 1, "Number of sequences for centroid-fast must be greater then 0."

    query_is_ambiguous = query.annotations['ambiguous']
    if query_is_ambiguous:
        msgfail = "Query sequence contains ambiguous characters. Can't use centroid-fast."
        ml.error(msgfail)
        raise AmbiguousQuerySequenceException(msgfail)

    candidates = BA_support.sel_seq_simple(all_seqs, query, len_diff)
    limit = int(n)
    return candidates[:limit]