Example #1
    def tearDown(self):
        files = glob.glob(blast_in + '.r-*')
        remove_files_with_try(
            [
                test_output_file,
            ] + files
        )
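All of the examples on this page exercise remove_files_with_try. Its actual implementation in rna-blast-analyze is not reproduced here, so the following is only a minimal sketch of a compatible helper, assuming it removes each path independently, tolerates missing files, and accepts an optional second message argument (several examples above pass '' positionally):

import logging
import os

ml = logging.getLogger(__name__)

def remove_files_with_try(files, msg=''):
    # Remove each file independently so that one failure does not abort the cleanup.
    for fi in files:
        try:
            os.remove(fi)
        except FileNotFoundError:
            # A missing file is expected when the step producing it was skipped.
            continue
        except OSError as e:
            ml.warning('%s cannot remove file %s: %s', msg, fi, e)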
Example #2
def _run_hybrid_ss_min_single(file_path, P, W, M):
    with TemporaryFile(mode='w+', encoding='utf-8') as tmp:  # 'w+' so the log can be read back on failure
        cmd3 = [
            '{}hybrid-ss-min'.format(CONFIG.mfold_path), '--suffix=DAT',
            '--NA=RNA', '--noisolate',
            '--mfold=' + str(P) + ',' + str(W) + ',' + str(M), file_path
        ]
        ml.debug(cmd3)

        rt = call(cmd3, cwd=os.path.dirname(file_path), stdout=tmp, stderr=tmp)

        if rt:
            msgfail = 'Execution of hybrid-ss-min failed'
            ml.error(msgfail)
            tmp.seek(0)
            raise exceptions.HybridssminException(msgfail, tmp.read())

        if not os.path.isfile(file_path + '.ct'):
            msgfail = 'Execution of hybrid-ss-min failed - no output file produced'
            ml.error(msgfail)
            tmp.seek(0)
            raise exceptions.HybridssminException(msgfail, tmp.read())

        with open(file_path + '.ct', 'r') as sout:
            pred_structures = ct2db(sout)

        remove_files_with_try([
            file_path + '.run',
            file_path + '.plot',
            file_path + '.dG',
            file_path + '.ct',
            file_path + '.ann',
        ])
        return pred_structures
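For context, hybrid-ss-min's --mfold option takes the classic mfold triple P,W,MAX, commonly documented as percent suboptimality, window size and maximum number of foldings. A hypothetical invocation; the path is a placeholder and the values mirror the (10, 2, 20) default used in Example #3:

# Hypothetical call; '/tmp/hits.fasta' is an assumed path.
structures = _run_hybrid_ss_min_single('/tmp/hits.fasta', P=10, W=2, M=20)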
Example #3
def subopt_fold_alifold(all_fasta_hits_file,
                        homologs_file,
                        aligner='muscle',
                        params=None,
                        threads=None):
    """
    run clustal/muscle on selected homologs file
    :return:
    """
    ml.debug(fname())
    if params is None:
        params = dict()
    # run aligner
    # =================================================================================================================
    if aligner == 'clustalo':
        clustal_params = ' --outfmt=clustal --force'
        clustal_params += params.get('clustalo', '')

        if threads:
            clustal_params += ' --threads={}'.format(threads)
        alig_file = compute_clustalo_clasic(homologs_file,
                                            clustalo_params=clustal_params)

    elif aligner == 'muscle':
        alig_file = run_muscle(homologs_file,
                               muscle_params=params.get('muscle', ''),
                               reorder=False)

    else:
        raise KeyError(
            'provided key ({}) not recognized - available: "clustalo", "muscle"'.
            format(aligner))

    # run consensus prediction
    # =================================================================================================================
    alif_file = compute_alifold(alig_file,
                                alifold_params=params.get('alifold', ''))

    # possibly need to decode alifold structure
    alif_str = read_seq_str(alif_file)[0]
    consensus_structure = alif_str.letter_annotations['ss0']

    subs = run_hybrid_ss_min(all_fasta_hits_file,
                             mfold=params.get('mfold', (10, 2, 20)),
                             threads=threads)

    # now compute rna distance score
    if threads == 1:
        new_structures = []
        for seq in subs:
            new_structures.append(_helper_subopt(seq, consensus_structure))
    else:
        with multiprocessing.Pool(processes=threads) as pool:
            tuples = [(seq, consensus_structure) for seq in subs]
            new_structures = pool.starmap(_helper_subopt, tuples)

    remove_files_with_try([alif_file, alig_file])
    return new_structures
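A sketch of how subopt_fold_alifold might be invoked; the file names and parameter values below are assumptions, not taken from the project:

# Hypothetical usage.
new_structs = subopt_fold_alifold(
    'all_hits.fasta',
    'selected_homologs.fasta',
    aligner='clustalo',
    params={'clustalo': '', 'alifold': '', 'mfold': (10, 2, 20)},
    threads=4,
)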
Example #4
    def test_BA_one_pred_method_simple(self):
        a = base_script + [
            '--blast_in',
            self.blast_xml,
            '--blast_query',
            self.query_double,
            '--blast_db',
            blast_db,
            '--mode',
            'simple',
            '--blast_regexp',
            r'(?<=\|)[A-Z0-9]*\.?\d*$',
            '--b_type',
            'xml',
            '--html',
            self.html,
            '--json',
            self.json,
            '--csv',
            self.csv,
            '--pandas_dump',
            self.pandas_dump,
            '--prediction_method',
            'rnafold',
            '--enable_overwrite',
            '--threads',
            '2',
        ]
        bb = call(a, cwd=root)
        self.assertEqual(bb, 0)

        # we have two input query sequences so output files will be numbered

        t = tab_output_equal(
            csvfile=iter2file_name(self.csv, True, 0),
            jsonfile=iter2file_name(self.json, True, 0),
            pdfile=iter2file_name(self.pandas_dump, True, 0),
        )
        self.assertTrue(t)

        t = tab_output_equal(
            csvfile=iter2file_name(self.csv, True, 1),
            jsonfile=iter2file_name(self.json, True, 1),
            pdfile=iter2file_name(self.pandas_dump, True, 1),
        )
        self.assertTrue(t)

        remove_files_with_try([
            iter2file_name(self.csv, True, 0),
            iter2file_name(self.json, True, 0),
            iter2file_name(self.pandas_dump, True, 0),
            iter2file_name(self.html, True, 0),
            iter2file_name(self.csv, True, 1),
            iter2file_name(self.json, True, 1),
            iter2file_name(self.pandas_dump, True, 1),
            iter2file_name(self.html, True, 1)
        ], '')
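The test derives per-query output names with iter2file_name, whose implementation is not shown on this page. A plausible sketch, assuming it simply appends the iteration index when multiple queries were processed:

def iter2file_name(base_name, multiple_queries, iteration):
    # Single-query runs keep the plain name; multi-query runs get a numeric suffix.
    if not multiple_queries:
        return base_name
    return '{}.{}'.format(base_name, iteration)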
Example #5
    def tearDown(self):
        files = glob.glob(blast_in + '.r-*')
        db_files = glob.glob(blast_db + '*')
        remove_files_with_try([
            self.csv,
            self.json,
            self.fasta_structures,
            self.fasta,
            self.html,
        ] + files + db_files)
Example #6
    def tearDown(self):
        files = glob.glob(
            os.path.join(fwd, test_data_dir, 'RF00001_short.blastout') +
            '.r-*')
        remove_files_with_try([
            self.html,
            self.csv,
            self.json,
            self.pandas_dump,
        ] + files)
Example #7
    def tearDown(self):
        files = glob.glob(blast_in + '.r-*')
        remove_files_with_try(
            [
                self.csv,
                self.json,
                self.pandas_dump,
                self.html
            ] + files
        )
Example #8
    def tearDown(self):
        files = glob.glob(blast_in + '.r-*')
        remove_files_with_try([test_html_file, self.log] + files)
Example #9
    def tearDown(self):
        remove_files_with_try([
            self.blast_xml,
            self.query_double,
        ], '')
Example #10
    def tearDown(self):
        files = glob.glob(blast_in + '.r-*')
        remove_files_with_try([
            test_output_file, self.csv, self.json, self.fasta,
            self.fasta_structures, self.test_backup_file
        ] + files)
Example #11
def cmmodel_rnafold_c(allhits_fasta, cmmodel_file, threads=None, params=None):
    ml.debug(fname())
    if params is None:
        params = dict()

    allhits_fasta_file, san_dict = sanitize_fasta_file(allhits_fasta)

    cmalign_params = ''
    if threads:
        cmalign_params += '--cpu {}'.format(threads)

    if 'cmalign' in params and params['cmalign']:
        cmalign_params += ' ' + params['cmalign']

    if '--notrunc' not in cmalign_params:
        cmalign_params += ' --notrunc'

    # rnafold params
    rnafold_params = params.get('RNAfold', '-C')
    assert isinstance(rnafold_params, str), "RNAfold parameters must be a string"
    if '-C' not in rnafold_params:
        # some parameters given but -C not present
        rnafold_params += ' -C'

    alig_file = run_cmalign_on_fasta(allhits_fasta_file,
                                     cmmodel_file,
                                     cmalign_params=cmalign_params)
    # read the multiple-sequence CM alignment; it is split per sequence below
    cm_alig = read_st(alig_file)

    remove_files_with_try([allhits_fasta_file, alig_file])

    structures = []
    for single_alig in trim_cmalign_sequence_by_refseq_one_seq(
            cm_alig, rs='SS_cons', convert2uppercase=True):
        out_alig, trimmed_seq = trim_and_repair_single_cm_alignment(
            single_alig)
        conserved_structure_pairs = find_nc_and_remove(
            str(trimmed_seq.seq), trimmed_seq.letter_annotations['dec_str'])
        trimmed_seq.letter_annotations[
            'constrains'] = conserved_structure_pairs

        # constraint prediction
        # write constraint file
        fd, temp_constraint_file = mkstemp(prefix='rba_',
                                           suffix='_41',
                                           dir=CONFIG.tmpdir)
        with os.fdopen(fd, 'w') as tmpf:
            tmpf.write('>{}\n{}\n{}\n'.format(
                trimmed_seq.id, str(trimmed_seq.seq),
                trimmed_seq.letter_annotations['constrains']))

        single_structure = rnafold_prediction(temp_constraint_file,
                                              params=rnafold_params)

        remove_one_file_with_try(temp_constraint_file)

        # trimmed_seq.letter_annotations['final'] = single_structure[0].letter_annotations['ss0']
        structures.append(single_structure[0])

    str_out = desanitize_fasta_names_in_seqrec_list(structures, san_dict)

    return str_out
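The temporary constraint file written above follows the RNAfold -C input convention: a FASTA header, the sequence, and a constraint string of the same length. Purely illustrative content (the sequence is invented):

# '(' and ')' force a pair, '.' leaves a position unconstrained,
# 'x' would force a position to stay unpaired.
example_constraint_input = (
    '>seq1\n'
    'GGGAAAUCCC\n'
    '(((....)))\n'
)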
Example #12
def alifold_refold_prediction(nr_homologs_hits_fasta,
                              all_hits_fasta,
                              refold='refold',
                              threads=None,
                              params=None,
                              msa_alg='clustalo'):
    """
    return predicted structures for all hits based on provided sequence homologs
    ! beware, clustal mixes order of sequences in profile alignment, correct for it
    possible param keys: "clustal", "alifold", "clustalo_profile", "repred_unpaired_tr"
    """
    ml.debug(fname())
    nr_path, san_dict = sanitize_fasta_file(nr_homologs_hits_fasta)
    all_path, san_dict = sanitize_fasta_file(all_hits_fasta,
                                             used_dict=san_dict)

    if params is None:
        params = dict()

    ref_pred = ['refold', 'refold_rnafoldc', 'conserved_ss_rnafoldc']
    if refold not in ref_pred:
        raise ValueError(
            'refold procedure not recognized: {}, possible values are {}'.format(
                refold, ' '.join(ref_pred)))

    cl_file = _aligner_block(nr_path, params, msa_alg, threads)

    # we cannot rely on the order of cl_file being the same as the order of the nr_homolog_hits_file
    ali_file = compute_alifold(cl_file,
                               alifold_params=params.get('alifold', ''))

    consensus_record = read_seq_str(ali_file)[0]

    clustalo_profile_params = '--outfmt clustal '
    clustalo_profile_params += params.get('clustalo_profile', '')
    if threads:
        clustalo_profile_params += ' --threads {}'.format(threads)
    realign_file = run_clustal_profile2seqs_align(
        cl_file, all_path, clustalo_params=clustalo_profile_params)
    realign_alig = AlignIO.read(realign_file, format='clustal')

    # slice the alignment: get the seq name from nr_homolog_hits_file, find it in the
    #  realignment and slice the whole segment off; the id may repeat, so it must be
    #  checked for multiple occurrences

    first_nr_record = _parse_first_record_only(nr_path)

    realign_allseq_position = [
        i for i, seq in enumerate(realign_alig) if seq.id == first_nr_record.id
    ]

    new_alig_for_refold = realign_alig[:realign_allseq_position[-1]]
    old_alig_in_new = realign_alig[realign_allseq_position[-1]:]

    orig_alignment = AlignIO.read(cl_file, format='clustal')

    first_original_alignment_record = orig_alignment[0]

    match_original_seq_in_new_alig = [
        i for i in old_alig_in_new
        if i.id == first_original_alignment_record.id
    ][0]

    mapping = _map_alignment_columns_from_profile_match(
        first_original_alignment_record, match_original_seq_in_new_alig)

    # map and repair the structure when the mapping is unambiguous
    cs_encode = encode_structure_unicode(
        consensus_record.letter_annotations['ss0'])
    new_consensus_structure_encoded = _repair_consensus_structure_by_maping(
        cs_encode,
        mapping,
        len(match_original_seq_in_new_alig.seq),
        gap_char=49)  # 49 == ord('1')
    new_consensus_structure_repaired = repair_structure_any_variant(
        new_consensus_structure_encoded)

    new_consensus_structure = decode_structure_unicode(
        new_consensus_structure_repaired)

    new_consensus_sequence = _repair_consensus_structure_by_maping(
        str(consensus_record.seq),
        mapping,
        len(match_original_seq_in_new_alig.seq),
        gap_char=ord('_'))

    # write new consensus to a file
    a_fd, new_alifold_consensus_file = mkstemp(prefix='rba_',
                                               suffix='_33',
                                               dir=CONFIG.tmpdir)
    with os.fdopen(a_fd, 'w') as f:
        f.write(new_consensus_sequence + '\n')
        f.write(new_consensus_structure + '\n')

    # write sliced alignment to a file
    sa_fd, sliced_alignment_file = mkstemp(prefix='rba_',
                                           suffix='_34',
                                           dir=CONFIG.tmpdir)
    with os.fdopen(sa_fd, 'w') as f:
        AlignIO.write(new_alig_for_refold, f, 'clustal')

    # now process the file, and map alignment to consensus structure
    if refold in ['refold', 'refold_rnafoldc']:
        refold_file = compute_refold(sliced_alignment_file,
                                     new_alifold_consensus_file)

        if refold == 'refold_rnafoldc':
            rnafold_parameters = params.get('RNAfold', '')
            if '-C' not in rnafold_parameters:
                rnafold_parameters += ' -C'

            seq_str = rnafold_prediction(refold_file,
                                         params=rnafold_parameters)

        else:
            seq_str = read_seq_str(refold_file)

        remove_one_file_with_try(refold_file)

    else:
        st_alig_file = build_stockholm_from_clustal_alig(
            sliced_alignment_file, new_alifold_consensus_file)
        repred_tr = str(params.get('repred_unpaired_tr', '9'))
        conseq_conserved = params.get('conseq_conserved', 1)

        seq_str = _refold_with_unpaired_conservation(
            st_alig_file,
            repred_tr=repred_tr,
            conseq_conserved=conseq_conserved)
        remove_one_file_with_try(st_alig_file)

    structures_out = desanitize_fasta_names_in_seqrec_list(seq_str, san_dict)

    remove_files_with_try([
        nr_path, all_path, sliced_alignment_file, new_alifold_consensus_file,
        cl_file, ali_file, realign_file
    ])

    return structures_out
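A hypothetical call; the file names and parameter values are assumed for illustration. 'refold_rnafoldc' reuses the mapped alifold consensus as an RNAfold -C constraint, per the branch above:

structures = alifold_refold_prediction(
    'nr_homologs.fasta',
    'all_hits.fasta',
    refold='refold_rnafoldc',
    threads=4,
    params={'alifold': '', 'RNAfold': '', 'repred_unpaired_tr': '9'},
)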
Example #13
    def tearDown(self):
        files = glob.glob(blast_in + '.r-*')
        remove_files_with_try([self.tmp_config_file] + files)
        shutil.rmtree(self.tmpdir)
Example #14
def cmmodel_rnafold_c(allhits_fasta,
                      cmmodel_file,
                      threads=None,
                      params=None,
                      timeout=None):
    ml.debug(fname())
    if params is None:
        params = dict()

    allhits_fasta_file, san_dict = sanitize_fasta_file(allhits_fasta)

    cmalign_params = ''
    if threads:
        cmalign_params += '--cpu {}'.format(threads)

    if 'cmalign' in params and params['cmalign']:
        cmalign_params += ' ' + params['cmalign']

    if '--notrunc' not in cmalign_params:
        cmalign_params += ' --notrunc'

    # rnafold params
    rnafold_params = params.get('RNAfold', '-C')
    assert isinstance(rnafold_params, str), "RNAfold parameters must be a string"
    if '-C' not in rnafold_params:
        # some parameters given but -C not present
        rnafold_params += ' -C'

    alig_file = run_cmalign_on_fasta(allhits_fasta_file,
                                     cmmodel_file,
                                     cmalign_params=cmalign_params,
                                     timeout=timeout)
    # read the multiple-sequence CM alignment
    cm_alig = read_st(alig_file)

    remove_files_with_try([allhits_fasta_file, alig_file])

    # ===== use refold.pl directly =====
    cm_alig_upper = cm_alig.to_upper()
    fd, temp_mock_consensus = mkstemp(prefix='rba_',
                                      suffix='_41',
                                      dir=CONFIG.tmpdir)
    f, temp_clustal_aln = mkstemp(prefix='rba_',
                                  suffix='_42',
                                  dir=CONFIG.tmpdir)
    with os.fdopen(f, 'w') as h_clustal, os.fdopen(fd, 'w') as h_constraints:
        cm_alig_upper.write_clustal(h_clustal)

        h_constraints.write('{}\n{}\n'.format(
            re.sub('[^ACTGU]',
                   '_',
                   cm_alig_upper.column_annotations['RF'],
                   flags=re.IGNORECASE),
            cm_strucutre2br(cm_alig_upper.column_annotations['SS_cons'])))

    temp_constraint_file = compute_refold(temp_clustal_aln,
                                          temp_mock_consensus,
                                          timeout=timeout)
    structures = rnafold_prediction(temp_constraint_file,
                                    params=rnafold_params,
                                    timeout=timeout)
    str_out = desanitize_fasta_names_in_seqrec_list(structures, san_dict)

    remove_files_with_try(
        [temp_constraint_file, temp_clustal_aln, temp_mock_consensus])

    return str_out
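cm_strucutre2br is not shown on this page; judging from its use it converts the covariance model's WUSS 'SS_cons' annotation into plain dot-bracket notation for refold.pl. A minimal sketch under that assumption (pseudoknot letters are simply treated as unpaired):

import re

def wuss_to_dot_bracket(ss_cons):
    # Map the WUSS bracket variants <>, [], {} and () to plain parentheses,
    # and everything else (unpaired, insert and gap symbols) to '.'.
    ss = re.sub(r'[<\[{(]', '(', ss_cons)
    ss = re.sub(r'[>\]})]', ')', ss)
    return re.sub(r'[^()]', '.', ss)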
Example #15
def expand_hits(hits, blast_db, query_length, extra=0, blast_regexp=None, skip_missing=False, msgs=None):
    """takes list of blast.HSP objects as first argument and
    path to local blast database as second argument
    then it uses blastdbcmd from blast+ installation to obtain desired sequence
    Two temporary files are used in this call and are deleted at final stage
    :return list of SeqRecord objects (parsed fasta file)
    """
    ml.info('Retrieving sequence neighborhoods for blast hits.')
    ml.debug(fname())

    fd, temp_filename = mkstemp(prefix='rba_', suffix='_25', dir=CONFIG.tmpdir)
    fdb, blast_tempfile = mkstemp(prefix='rba_', suffix='_26', dir=CONFIG.tmpdir)
    os.close(fdb)
    exp_hits = []
    strand = []
    loc = []
    try:
        with os.fdopen(fd, 'w') as temp_file:
            for index, hit in enumerate(hits):
                # +1 here because blastdbcmd counts sequences from 1
                if hit[1].sbjct_end < hit[1].sbjct_start:
                    # this is hit to minus strand
                    start = hit[1].sbjct_end - _positive_index(query_length - hit[1].query_end) - extra
                    end = hit[1].sbjct_start + hit[1].query_start + extra - 1
                    strand.append(-1)
                    d = {'query_start': hit[1].sbjct_end, 'query_end': hit[1].sbjct_start,
                         'extended_start': hit[1].sbjct_end - _positive_index(query_length - hit[1].query_end),
                         'extended_end': hit[1].sbjct_start + hit[1].query_start - 1,
                         'strand': -1}
                else:
                    # this is hit to plus strand
                    start = hit[1].sbjct_start - hit[1].query_start + 1 - extra
                    end = hit[1].sbjct_end + _positive_index(query_length - hit[1].query_end) + extra
                    strand.append(1)
                    d = {'query_start': hit[1].sbjct_start, 'query_end': hit[1].sbjct_end,
                         'extended_start': hit[1].sbjct_start - hit[1].query_start + 1,
                         'extended_end': hit[1].sbjct_end + _positive_index(query_length - hit[1].query_end),
                         'strand': 1}

                # ====== information about possible trim ======
                # assume ok
                d['trimmed_ss'] = False
                d['trimmed_se'] = False
                d['trimmed_es'] = False
                d['trimmed_ee'] = False

                d['super_start'] = start
                d['super_end'] = end

                if start < 1:
                    start = 1  # blastdbcmd counts sequences from 1
                    d['trimmed_ss'] = True

                # repair possible extended start violation
                if d['extended_start'] < 1:
                    d['trimmed_es'] = True

                # add blast record
                d['blast'] = hit

                try:
                    bdb_accession = match_acc(hit[0], blast_regexp)
                except exceptions.AccessionMatchException as e:
                    remove_files_with_try([temp_filename, blast_tempfile])
                    raise e

                d['blast'][0] = bdb_accession
                loc.append(d)

                temp_file.write('{} -region {}-{}\n'.format(bdb_accession, start, end))
    except RuntimeError as e:
        ml.error(str(e))
        sys.exit(1)

    cmd = [
        '{}blastdbcmd'.format(CONFIG.blast_path),
        '-dbtype',
        'nucl',
        '-db',
        str(blast_db),
        '-entry_batch',
        temp_filename,
        '-out',
        blast_tempfile
    ]
    ml.debug(cmd)

    try:
        pcall = Popen(
            cmd,
            stdout=PIPE,
            stderr=PIPE,
            universal_newlines=True
        )
        out, errs = pcall.communicate()
    except FileNotFoundError:
        msgfail = 'Unable to run blastdbcmd command, please check its availability.'
        ml.error(msgfail)
        remove_files_with_try([temp_filename, blast_tempfile])
        sys.exit(1)

    # inspect stderr (blastdbcmd returns exit code 1 even if only one sequence is missing)
    msgfail = 'Incomplete database. Some sequences not found in database.'
    msgfail += ' Details: ' + errs

    if errs and not skip_missing:
        ml.error(msgfail)
        remove_files_with_try([temp_filename, blast_tempfile])
        sys.exit(1)

    elif errs and skip_missing:
        ml.warning(msgfail)

    requested_ids = {d['blast'][0] for d in loc}
    obtained_ids = set()

    with open(blast_tempfile, 'r') as bf:
        index = 0
        for parsed_record in SeqIO.parse(bf, 'fasta'):
            record_id = parsed_record.id.split(':')[0]

            if parsed_record.description.startswith(parsed_record.id):
                parsed_record.description = parsed_record.description[len(parsed_record.id):].strip()

            obtained_ids.add(record_id)
            index = _get_correct_blast_hit(record_id, loc, index, skip=skip_missing)

            parsed_record.annotations = loc[index]
            parsed_record.annotations['msgs'] = []
            # add uid to ensure that all hits are unique
            parsed_record.id = 'uid:' + str(index) + '|' + record_id

            if loc[index]['trimmed_ss']:
                if loc[index]['super_start'] + len(parsed_record.seq) < loc[index]['super_end'] + loc[index]['super_start']:
                    parsed_record.annotations['trimmed_se'] = True
                    if loc[index]['trimmed_es']:
                        if len(parsed_record.seq) < loc[index]['extended_end'] + loc[index]['extended_start']:
                            parsed_record.annotations['trimmed_ee'] = True
                    else:
                        if len(parsed_record.seq) < loc[index]['extended_end'] - loc[index]['extended_start']:
                            parsed_record.annotations['trimmed_ee'] = True
            else:
                if loc[index]['super_start'] + len(parsed_record.seq) - 1 < loc[index]['super_end']:
                    parsed_record.annotations['trimmed_se'] = True
                    if loc[index]['super_start'] + len(parsed_record.seq) - 1 < loc[index]['extended_end']:
                        parsed_record.annotations['trimmed_ee'] = True

            msgsub = '{}: Sequence cannot be extended sufficiently'.format(record_id)
            if parsed_record.annotations['trimmed_ss']:
                msgwarn = msgsub + '. Missing {} nt upstream in the genome.'.format(parsed_record.annotations['super_start'])
                parsed_record.annotations['msgs'].append(msgwarn)
                ml.warning(msgwarn)
            if parsed_record.annotations['trimmed_se']:
                msgwarn = msgsub + '. Missing nt downstream in the genome.'
                parsed_record.annotations['msgs'].append(msgwarn)
                ml.warning(msgwarn)
            if parsed_record.annotations['trimmed_es']:
                msgwarn = msgsub + ' by unaligned portion of the query. THIS IS PROBABLY A FRAGMENT!'
                msgwarn += ' Trimmed upstream.'
                parsed_record.annotations['msgs'].append(msgwarn)
                ml.warning(msgwarn)
            if parsed_record.annotations['trimmed_ee']:
                msgwarn = msgsub + ' by unaligned portion of the query. THIS IS PROBABLY A FRAGMENT!'
                msgwarn += ' Trimmed downstream.'
                parsed_record.annotations['msgs'].append(msgwarn)
                ml.warning(msgwarn)

            exp_hits.append(parsed_record)
            index += 1

    remove_files_with_try(
        [temp_filename, blast_tempfile]
    )

    missing_ids = requested_ids - obtained_ids
    if missing_ids and msgs is not None:
        msgs.append('Incomplete database. Sequences with the following ids were not found:')
        for m in missing_ids:
            msgs.append(m)

    return exp_hits, strand
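A sketch of a call site; every name below is an assumption. The hits would typically be (accession, HSP) pairs parsed from BLAST output with Biopython:

# Hypothetical usage; database path, query length and regexp are illustrative.
msgs = []
records, strands = expand_hits(
    parsed_hits,          # list of [accession, HSP] items
    'nt_local',           # prefix of a local blast database
    query_length=120,
    extra=10,
    blast_regexp=r'(?<=\|)[A-Z0-9]*\.?\d*$',
    skip_missing=True,
    msgs=msgs,
)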