Exemplo n.º 1
0
def single_design(run_no, seed, seq_no):
    """Design one sequence, compare it with the target tree and log the result.

    A sequence is produced by ``rna_designer.run_rnafbinv`` for the
    module-level ``target_structure`` / ``target_sequence``, folded with
    ``vienna.fold`` and its MFE and centroid structures are each aligned
    against the module-level ``target_tree``.  A tab separated record is
    written to both the root logger and ``design_logger``.

    :param run_no: run number, only used for logging / output
    :param seed: seed sequence handed to the designer
    :param seq_no: sequence number, used for logging and returned to the caller
    :return: tuple ``(seq_no, designed_sequence)``
    """
    logging.info("Design - start - run {}, seq_no {}".format(run_no, seq_no))
    # send sequences to RNAfbinv
    # TODO: use new version of RNAfbinv
    designer_random_seed = random_gen.getrandbits(64)
    designed_sequence = rna_designer.run_rnafbinv(
        target_structure,
        target_sequence,
        iter_no=1000,
        seed=seed,
        random_seed=designer_random_seed)
    folds = vienna.fold(designed_sequence)
    logging.info("Design - compare - run {}, seq_no {}".format(run_no, seq_no))

    def compare_to_target(fold_key):
        # Build the tree of one predicted structure and align it to the target
        predicted_structure = folds.get(fold_key)
        predicted_tree = shapiro_tree_aligner.get_tree(predicted_structure,
                                                       designed_sequence)
        aligned_tree, aligned_score = shapiro_tree_aligner.align_trees(
            predicted_tree, target_tree)
        return predicted_tree, aligned_score, aligned_tree

    # record layout: run, seq no, seed, sequence, then (tree, score, match tree)
    # for the MFE structure followed by the same triple for the centroid
    record = [run_no, seq_no, seed, designed_sequence]
    record.extend(compare_to_target('MFE'))
    record.extend(compare_to_target('centroid'))
    out_text = "\t".join("{}".format(field) for field in record)
    logging.info("Design - printing results:\n{}".format(out_text))
    design_logger.info(out_text)
    return seq_no, designed_sequence
Exemplo n.º 2
0
def analyze_res(sequence, target_tree):
    """Fold ``sequence`` and align its MFE and centroid trees to ``target_tree``.

    :param sequence: sequence passed to ``vienna.fold``
    :param target_tree: tree every predicted structure is aligned against
    :return: 6-tuple ``(tree_mfe, score_mfe, match_tree_mfe,
             tree_centroid, score_centroid, match_tree_centroid)``
    """
    folds = vienna.fold(sequence)
    collected = []
    # MFE first, centroid second - the order defines the returned tuple layout
    for fold_key in ('MFE', 'centroid'):
        predicted_tree = shapiro_tree_aligner.get_tree(folds.get(fold_key),
                                                       sequence)
        aligned_tree, aligned_score = shapiro_tree_aligner.align_trees(
            predicted_tree, target_tree)
        collected += [predicted_tree, aligned_score, aligned_tree]
    return tuple(collected)
Exemplo n.º 3
0
 def __init__(self,
              sequence: str,
              options: Dict[str, Any],
              calc_robusntess: bool = True):
     """Fold ``sequence`` and score it against the target described in ``options``.

     Keys read from ``options``: 'RNAfold' (object exposing ``fold``), 'fold'
     (name of the fold type to use), 'target_structure', 'target_sequence',
     'alignment_rules' and the optional 'target_energy' /
     'target_neutrality'.

     :param sequence: the sequence to analyse
     :param options: run options (see keys above)
     :param calc_robusntess: when True ``self.mutational_robustness`` is
         filled using ``calculate_neutrality``, otherwise it is set to None
     """
     fold_type = options.get('fold')
     energy_key = "{}_energy".format(fold_type)
     self.sequence = sequence
     fold_map = options.get('RNAfold').fold(sequence)
     self.fold_type = fold_type
     self.energy = fold_map.get(energy_key)
     self.structure = fold_map.get(fold_type)
     if calc_robusntess:
         self.mutational_robustness = calculate_neutrality(
             self.sequence, self.structure, options)
     else:
         self.mutational_robustness = None
     target_tree = shapiro_tree_aligner.get_tree(
         options['target_structure'], options['target_sequence'])
     self.result_tree = shapiro_tree_aligner.get_tree(
         self.structure, self.sequence)
     self.align_tree, self.score = shapiro_tree_aligner.align_trees(
         self.result_tree, target_tree, options['alignment_rules'])
     # Add energy diff
     target_energy = options.get('target_energy')
     if target_energy is not None:
         # indexing (not .get) on purpose: a missing energy is an error here
         self.score += abs(fold_map[energy_key] - target_energy)
     # Add mutation robustness diff
     target_neutrality = options.get('target_neutrality')
     if target_neutrality is not None:
         # Reuse the value computed above instead of calling
         # calculate_neutrality a second time with identical arguments.
         # NOTE(review): assumes calculate_neutrality is deterministic - confirm
         neutrality = self.mutational_robustness
         if neutrality is None:
             neutrality = calculate_neutrality(sequence, self.structure,
                                               options)
         self.score += abs(neutrality - target_neutrality) * 100
     self.tree_edit_distance = tree_aligner.get_align_tree_distance(
         self.align_tree)
     self.bp_dist = bp_distance(self.structure, options['target_structure'])
Exemplo n.º 4
0
def get_align_score(identifier: str, sequence: str, cm_path: str, target_tree):
    """Score one sequence, as aligned by a covariance model, against ``target_tree``.

    The sequence is written to a fasta file through ``infernal.generate_fasta``,
    the structure / sequence pair is obtained from
    ``dive_statistics.get_cm_struct`` and its tree is aligned to
    ``target_tree``.  The fasta file is removed before returning, also when
    an exception is raised.

    :param identifier: name used for the sequence inside the fasta file
    :param sequence: sequence to score
    :param cm_path: path of the covariance model file
    :param target_tree: tree to align against
    :return: the alignment score
    """
    tmp_fasta = None
    try:
        tmp_fasta = infernal.generate_fasta({identifier: sequence})
        aligned_structure, aligned_sequence = dive_statistics.get_cm_struct(
            cm_path, tmp_fasta.name)
        aligned_tree = shapiro_tree_aligner.get_tree(aligned_structure,
                                                     aligned_sequence)
        _match_tree, align_score = shapiro_tree_aligner.align_trees(
            aligned_tree, target_tree)
        return align_score
    finally:
        # tmp_fasta stays None when generate_fasta itself failed
        if tmp_fasta is not None:
            os.remove(tmp_fasta.name)
Exemplo n.º 5
0
def score_sequence(sequence: str, target_tree: tree_aligner.Tree,
                   options: Dict[str, Any]):
    """Fold ``sequence`` and score it against ``target_tree``.

    The base score is the tree alignment score.  When 'target_energy' is
    present in ``options`` the absolute energy difference is added, and when
    'target_neutrality' is present the absolute neutrality difference times
    100 is added.

    :param sequence: sequence to fold and score
    :param target_tree: tree the folded sequence is aligned against
    :param options: run options; reads 'RNAfold', 'fold', 'alignment_rules',
        'target_energy' and 'target_neutrality'
    :return: tuple ``(aligned tree, score)``
    """
    fold_type = options.get('fold')
    folds = options.get('RNAfold').fold(sequence)
    structure = folds[fold_type]
    # Align score tree alignment + sequence alignment
    sequence_tree = shapiro_tree_aligner.get_tree(structure, sequence)
    aligned_tree, total = shapiro_tree_aligner.align_trees(
        sequence_tree, target_tree, options['alignment_rules'])
    # Add energy diff
    wanted_energy = options.get('target_energy')
    if wanted_energy is not None:
        energy = folds['{}_energy'.format(fold_type)]
        total += abs(energy - wanted_energy)
    # Add mutation robustness diff
    wanted_neutrality = options.get('target_neutrality')
    if wanted_neutrality is not None:
        neutrality = calculate_neutrality(sequence, structure, options)
        total += abs(neutrality - wanted_neutrality) * 100
    return aligned_tree, total
Exemplo n.º 6
0
def run_search(run_code: str, designed_object):
    """Build a covariance model for a designed sequence, search with it and log hits.

    A single sequence covariance model is written to
    ``<output_dir>/<run_code>.cm`` and searched against ``NT_PATH``.  Every
    result is folded, aligned to the module-level ``target_tree`` and, if its
    score passes the filter, written to ``result_logger`` as a tab separated
    record.  A failure while handling one result is logged and does not stop
    the processing of the remaining results.

    :param run_code: identifier of the design, used for the cm file name and logs
    :param designed_object: object exposing ``sequence`` and ``structure``
    :return: None
    """
    general_run_logger.info('Starting search {}'.format(run_code))
    cm_path = os.path.join(output_dir, '{}.cm'.format(run_code))
    if not infernal.generate_single_seq_cm(designed_object.sequence, cm_path,
                                           designed_object.structure):
        general_run_logger.error(
            'Failed to build covariance model. run code: {}\n{}\n{}'.format(
                run_code, designed_object.sequence, designed_object.structure))
        return
    results = infernal.search_cm(cm_path, NT_PATH)
    if results is None:
        general_run_logger.error('Search failed {} {}\n{}'.format(
            run_code, cm_path, designed_object.sequence))
        return
    general_run_logger.info('Finished search {}, {} results'.format(
        run_code, len(results)))
    for res_no, res in enumerate(results):
        try:
            sequence = res.get('sequence')
            structure = vienna.fold(sequence)['MFE']
            res_tree = shapiro_tree_aligner.get_tree(structure, sequence)
            _, score = shapiro_tree_aligner.align_trees(res_tree, target_tree)
            if score < 300 and score % 100 < 30:
                general_run_logger.info(
                    'Adding result {}, score {} sequence {}'.format(
                        run_code, score, sequence))
                # result columns:
                # seq code, match no, sequence, structure, score, target id
                result_logger.info('{}\t{}\t{}\t{}\t{}\t{}'.format(
                    run_code, res_no, sequence, structure, score,
                    res.get('identifier')))
            else:
                # NOTE(review): this branch is taken when the score is NOT
                # below the thresholds, the wording of the message looks
                # inverted - confirm before changing the text
                general_run_logger.warning(
                    'Score too low {} result no {}, score {}, sequence: {}'.
                    format(run_code, res_no, score, sequence))
        except Exception:
            # Was `.fatel(...)`, which raised AttributeError from inside the
            # handler and aborted the whole loop. exc_info keeps the traceback.
            general_run_logger.critical(
                'Exception in search {}, res no {}, {}'.format(
                    run_code, res_no, res),
                exc_info=True)
Exemplo n.º 7
0
# Script entry point: reads the target sequence / structure from the command
# line, builds the target tree and prepares logging and the fasta index.
if __name__ == "__main__":
    # root folder holding the "Output" directory and the "fasta_db" database
    folder = "/DB/"
    # at least three arguments are required (sys.argv[0] is the script name)
    if len(sys.argv) < 4:
        print(
            "Usage: search_runner.py <input sequence> <input structure> <<<amount of runs> <seeds per run> <designed sequence per seed>> | <sequence list>>"
        )
        sys.exit(-1)
    # gather parameters
    target_sequence = sys.argv[1]
    target_structure = sys.argv[2]
    # init random number generator
    random_gen = random.Random()
    # find maximum match: the target tree is aligned with itself
    target_tree = shapiro_tree_aligner.get_tree(target_structure,
                                                target_sequence)
    _, target_score = shapiro_tree_aligner.align_trees(target_tree,
                                                       target_tree)
    logging.basicConfig(
        level=logging.INFO,
        format=
        '%(levelname)s:%(asctime)s:Name[%(name)s]:Thread[%(thread)d] - %(message)s'
    )
    logging.info("Target score: {}. Target tree: {}".format(
        target_score, target_tree))
    output_dir = os.path.join(folder, "Output")
    # NOTE(review): original comment read "new method infernal has it's out
    # index" - presumably the newer infernal based search keeps its own index
    # and this fasta index serves the older method; confirm
    indexed_fasta_map = index_all(os.path.join(folder, "fasta_db"))
    # Start runs
    # five arguments: sequence, structure, amount of runs, seeds per run and
    # designed sequences per seed (see the usage text above)
    if len(sys.argv) == 6:
        # init log (result) files
        # time stamp formatted as year_month_day_hour_minute_second
        run_time_stamp = datetime.datetime.fromtimestamp(
            time.time()).strftime("%Y_%m_%d_%H_%M_%S")
Exemplo n.º 8
0
 def calc_score(source_sequence: str, source_structure: str) -> float:
     """Return the alignment score of the given sequence / structure.

     The tree built from the arguments is aligned against ``target_tree``
     taken from the enclosing scope.
     """
     candidate_tree = shapiro_tree_aligner.get_tree(source_structure,
                                                    source_sequence)
     alignment = shapiro_tree_aligner.align_trees(candidate_tree, target_tree)
     _aligned_tree, distance = alignment
     return distance
Exemplo n.º 9
0
def generate_clusters(match_file_path: str,
                      design_file_path: str,
                      target_tree,
                      is_filter_bacteria: bool = False,
                      mode: MODE = MODE.OLD) -> List[DesignGroup]:
    """Group the matches of a search by the design that produced them.

    Both input files are tab separated and their first line is skipped.

    Design file columns used:
      * ``MODE.OLD``: 0 = design code, 2 = sequence, 4 = structure
      * otherwise: design code is ``<col 0>_<col 1>``, 3 = sequence.  The
        sequence is folded (MFE) and the design is kept only when its
        alignment score against ``target_tree`` is below ``FILTER_THRESHOLD``.

    Match file columns used:
      * ``MODE.OLD``: 0 = design code, 2 = sequence, 5 = identifier
      * otherwise: 0 = design code, 1 = sequence; a running number is used as
        identifier.

    :param match_file_path: path of the match file
    :param design_file_path: path of the design file
    :param target_tree: tree used to filter designs when mode is not OLD
    :param is_filter_bacteria: in OLD mode, skip matches whose identifier
        resolves to a taxonomy below 'Bacteria'
    :param mode: file layout selector
    :return: one DesignGroup per design that appears in the match file
    """
    vienna_folder = None
    design_group_map = {}
    try:
        if mode == MODE.NEW:
            vienna_folder = vienna.LiveRNAfold()
            vienna_folder.start()
        with open(match_file_path,
                  'r') as match_file, open(design_file_path,
                                           'r') as design_file:
            seq_code_map = {}
            design_file.readline()
            for line in design_file:
                line = line.strip()
                if not line:
                    continue
                items = line.split('\t')
                if mode == MODE.OLD:
                    seq_code_map[items[0]] = {
                        'sequence': items[2].strip(),
                        'structure': items[4].strip()
                    }
                else:
                    # new doesnt go through a filter so we will filter it now
                    code = '{}_{}'.format(items[0].strip(), items[1].strip())
                    sequence = items[3].strip()
                    structure = vienna_folder.fold(sequence)['MFE']
                    source_tree = shapiro_tree_aligner.get_tree(
                        structure, sequence)
                    _, score = shapiro_tree_aligner.align_trees(
                        source_tree, target_tree)
                    if score < FILTER_THRESHOLD:
                        seq_code_map[code] = {
                            'sequence': sequence,
                            'structure': structure
                        }
            match_file.readline()
            for line in match_file:
                line = line.strip()
                if not line:
                    continue
                items = line.split('\t')
                design_id = items[0].strip()
                design_info = seq_code_map.get(design_id)
                if design_info is None:
                    # match of a design that is unknown or was filtered out
                    continue
                # Create the group only the first time a design is seen; the
                # previous dict.get(key, DesignGroup(...)) built a throwaway
                # object for every single match line.
                design_group = design_group_map.get(design_id)
                if design_group is None:
                    design_group = DesignGroup(design_id, design_info)
                    design_group_map[design_id] = design_group
                if mode == MODE.OLD:
                    identifier = items[5].strip()
                    if not is_filter_bacteria or not check_ancestor(
                            'Bacteria',
                            get_tax_id(identifier.split('/', 1)[0])):
                        design_group.add_match(
                            identifier, {
                                'identifier': identifier,
                                'sequence': items[2].strip(),
                                'round': 0
                            })
                else:
                    # new mode didnt save identifier, add just and replace on first search
                    match_no = len(design_group.matches)
                    design_group.add_match(
                        str(match_no), {
                            'identifier': match_no,
                            'sequence': items[1].strip(),
                            'round': 0
                        })
    finally:
        if vienna_folder is not None:
            vienna_folder.close()
    return list(design_group_map.values())
Exemplo n.º 10
0
def recreate_cm(folder_path: str,
                design_code: str,
                desc_keywords: List[str],
                test_name: str,
                sequence: str,
                structure: str,
                identifier_keywords: List[str] = None,
                new_seq_map: Optional[Dict[str, str]] = None,
                top_taxonomy: str = None,
                score_cutoff: int = 250):
    """Rebuild a covariance model alignment from selected results of one design.

    Rows of ``<folder_path>/FINAL_all_ext`` (tab separated, first line
    skipped) are selected when column 0 equals ``design_code``, column 15
    matches ``desc_keywords``, column 1 matches ``identifier_keywords`` and
    the taxonomy id in column 11 passes ``top_taxonomy``.  The selected
    sequences are aligned to ``<folder_path>/<test_name>.cm`` (generated from
    ``sequence`` / ``structure`` when the file does not exist yet).  Every
    aligned entry scoring above ``score_cutoff`` against the hard-coded
    reference tree is removed from the alignment, the rest is written to
    ``<test_name>.txt``.  Finally ``<test_name>.sto`` is stored and passed to
    ``dive_statistics.generate_r2r``.

    The process is terminated through ``exit`` when nothing is selected (0),
    when the model cannot be generated (-1) or when the alignment fails (-2).

    :param folder_path: folder holding FINAL_all_ext; outputs are written here
    :param design_code: design code to select
    :param desc_keywords: keywords for column 15 (None accepts everything)
    :param test_name: base name of the .cm / .txt / .sto output files
    :param sequence: sequence for the single sequence covariance model
    :param structure: structure for the single sequence covariance model
    :param identifier_keywords: keywords for column 1 (None accepts everything)
    :param new_seq_map: optional identifier -> sequence overrides
    :param top_taxonomy: optional taxonomy every selected row must descend from
    :param score_cutoff: maximal accepted alignment score
    """
    # None instead of a shared mutable default dictionary
    if new_seq_map is None:
        new_seq_map = {}

    def keyword_relevant(check_item: str, keymap: List[str]) -> bool:
        # case insensitive substring test, no key list means no restriction
        if keymap is None:
            return True
        lowered_item = check_item.lower()
        return any(key.lower() in lowered_item for key in keymap)

    def check_taxonomy(tax_id: Optional[int]) -> bool:
        if top_taxonomy is None:
            return True
        elif tax_id is None:
            return False
        return result_dive.check_ancestor(top_taxonomy, tax_id)

    def check_c_or_u(named_tree) -> Tuple[bool, bool]:
        _, score = shapiro_tree_aligner.align_trees(named_tree, c_tree)
        check_c = score < 1000
        _, score = shapiro_tree_aligner.align_trees(named_tree, u_tree)
        check_u = score < 1000
        return check_c, check_u

    # reference trees: same structure, they differ only in one position of the
    # sequence constraint (Y, C and U respectively)
    tree = shapiro_tree_aligner.get_tree(
        ".....((((((((...(.(((((.......))))).)........((((((.......))))))..)))))))).....",
        "NNNNNNNNNNNNNUNNNNNNNNNNNNNNNNNNNNNNNNUNNNUNNNNNNNNNNNNNNNNNNNNNNYNNNNNNNNNNNNN"
    )
    c_tree = shapiro_tree_aligner.get_tree(
        ".....((((((((...(.(((((.......))))).)........((((((.......))))))..)))))))).....",
        "NNNNNNNNNNNNNUNNNNNNNNNNNNNNNNNNNNNNNNUNNNUNNNNNNNNNNNNNNNNNNNNNNCNNNNNNNNNNNNN"
    )
    u_tree = shapiro_tree_aligner.get_tree(
        ".....((((((((...(.(((((.......))))).)........((((((.......))))))..)))))))).....",
        "NNNNNNNNNNNNNUNNNNNNNNNNNNNNNNNNNNNNNNUNNNUNNNNNNNNNNNNNNNNNNNNNNUNNNNNNNNNNNNN"
    )
    gather_results = {}
    full_results = {}
    with open(os.path.join(folder_path, "FINAL_all_ext"), 'r') as input_all:
        input_all.readline()
        for line in input_all:
            if line.strip() == '':
                continue
            parts = line.strip().split('\t')
            try:
                tax_id = int(parts[11])
            except ValueError:
                tax_id = None
            if parts[0].strip() == design_code and keyword_relevant(parts[15], desc_keywords) and\
                    keyword_relevant(parts[1], identifier_keywords) and check_taxonomy(tax_id):
                read_sequence = new_seq_map.get(parts[1], parts[4])
                gather_results[parts[1]] = read_sequence
                full_results[parts[1]] = {
                    'sequence': read_sequence,
                    'rfam_eval': parts[7],
                    'tax_id': parts[11],
                    'gene_id': parts[13],
                    'gene_loc': parts[14],
                    'gene_desc': parts[15]
                }
    if not gather_results:
        print("No results from design {} match keywords {}".format(
            design_code, desc_keywords))
        exit(0)
    # defined before the try block so the cleanup below never meets an
    # unbound name
    cm_file_name = None
    sto_file_name = None
    try:
        # The temporary files are closed right away: only their (now unused)
        # names are needed, the files themselves are written further below.
        cm = NamedTemporaryFile('w', suffix=".cm")
        cm_file_name = cm.name
        cm.close()
        sto = NamedTemporaryFile('w', suffix=".sto")
        sto_file_name = sto.name
        sto.close()
        real_cm = os.path.join(folder_path, "{}.cm".format(test_name))
        if not os.path.exists(real_cm):
            if not infernal.generate_single_seq_cm(sequence, cm_file_name,
                                                   structure):
                print(
                    "Failed to generate single sequence-structure cm file for {}"
                    .format(design_code))
                exit(-1)
            shutil.copy(cm_file_name, real_cm)
        # real_cm already contains folder_path; joining it a second time
        # pointed to <folder>/<folder>/<name>.cm for a relative folder_path
        if not infernal.align_sequences(gather_results, real_cm,
                                        sto_file_name):
            print("Failed to generate alignment for {}".format(design_code))
            exit(-2)
        removed_names = set()
        cm_map = read_sto(sto_file_name)
        with open(os.path.join(folder_path, "{}.txt".format(test_name)),
                  'w') as tbl_out:
            tbl_out.write(
                'name\tscore\ttax id\trfam evalue\tgene id\tgene location\tgene description\tcm sequence\t'
                'cm structure\toriginal sequence\tguanine_bind\tadenine_bind\n'
            )
            for name, (cmsequence, cmstructure) in cm_map.items():
                named_tree = shapiro_tree_aligner.get_tree(
                    cmstructure, cmsequence)
                _, score = shapiro_tree_aligner.align_trees(named_tree, tree)
                is_c, is_u = check_c_or_u(named_tree)
                if score > score_cutoff:
                    removed_names.add(name)
                    print("Removing {} score {}\n{}\n{}".format(
                        name, score, cmsequence, cmstructure))
                else:
                    # NOTE(review): assumes every aligned name is a key of
                    # full_results - confirm read_sto keeps the names intact
                    res_item = full_results.get(name)
                    tbl_out.write(
                        "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".
                        format(name, score, res_item.get('tax_id'),
                               res_item.get('rfam_eval'),
                               res_item.get('gene_id'),
                               res_item.get('gene_loc'),
                               res_item.get('gene_desc'), cmsequence,
                               cmstructure, res_item.get('sequence'), is_c,
                               is_u))
        clean_sto(removed_names, sto_file_name)
        real_sto = os.path.join(folder_path, "{}.sto".format(test_name))
        shutil.copy(sto_file_name, real_sto)
        dive_statistics.generate_r2r(folder_path, real_sto, force=True)

    finally:
        # best effort cleanup of the temporary files; a file that was never
        # created (or is already gone) is not an error
        for tmp_path in (cm_file_name, sto_file_name):
            if tmp_path is None:
                continue
            try:
                os.remove(tmp_path)
            except OSError:
                pass
Exemplo n.º 11
0
 def check_c_or_u(named_tree) -> Tuple[bool, bool]:
     """Tell whether ``named_tree`` aligns to ``c_tree`` / ``u_tree`` below 1000.

     Both reference trees come from the enclosing scope.

     :return: ``(score against c_tree < 1000, score against u_tree < 1000)``
     """
     verdicts = []
     for reference_tree in (c_tree, u_tree):
         _aligned, distance = shapiro_tree_aligner.align_trees(
             named_tree, reference_tree)
         verdicts.append(distance < 1000)
     return verdicts[0], verdicts[1]
Exemplo n.º 12
0
#!/usr/bin/env python3
'''
Gather good indexes from sequence / structure
Usage: gather_index.py source_sequence source_structure target_sequence target_structure
'''

import sys
from rnafbinv import shapiro_tree_aligner

# Four positional arguments are required (sys.argv[0] is the script name)
if len(sys.argv) < 5:
    print(
        "Usage: gather_index.py <source_sequence> <source_structure> <target_sequence> <target_structure>"
    )
    # sys.exit instead of the interactive helper exit(), which is injected by
    # the site module and is not guaranteed to exist
    sys.exit(-1)

# Arguments may arrive wrapped in literal quotes: strip single quotes first,
# then double quotes, from both ends of each argument
source_sequence, source_structure, target_sequence, target_structure = (
    argument.strip("'").strip('"') for argument in sys.argv[1:5])
source_tree = shapiro_tree_aligner.get_tree(source_structure, source_sequence)
target_tree = shapiro_tree_aligner.get_tree(target_structure, target_sequence)
aligned_tree, score = shapiro_tree_aligner.align_trees(source_tree,
                                                       target_tree)
# print the matching indexes first, the non matching ones second
matched, unmatched = shapiro_tree_aligner.get_matching_indexes(aligned_tree)
print(matched)
print(unmatched)
Exemplo n.º 13
0
def get_max_score(seq, struct):
    """Build the tree of ``struct`` / ``seq`` and align it with itself.

    :param seq: sequence handed to ``shapiro_tree_aligner.get_tree``
    :param struct: structure handed to ``shapiro_tree_aligner.get_tree``
    :return: tuple ``(tree, score of the tree aligned against itself)``
    """
    own_tree = shapiro_tree_aligner.get_tree(struct, seq)
    self_alignment = shapiro_tree_aligner.align_trees(own_tree, own_tree)
    _aligned_tree, self_score = self_alignment
    return own_tree, self_score
Exemplo n.º 14
0
def simulated_annealing(options: Dict[str, Any]):
    """Search for a sequence matching the target by simulated annealing.

    Keys read from ``options``: 'reduced_bi', 'rng', 'iter', 'look_ahead',
    'starting_sequence', 'random', 'target_structure', 'target_sequence',
    'motifs', 'logger', 'updater' and 'stop'.  'alignment_rules' is written
    into ``options`` for the scoring helpers.

    Every iteration tries up to 'look_ahead' perturbations of the current
    sequence and moves to the first one accepted by
    ``acceptance_probability``.  The search ends after 'iter' iterations or
    as soon as the best score reaches 0.

    :param options: run options (see keys above)
    :return: the best scoring sequence seen, or None when ``options`` is
        empty, when the motifs do not match the target structure or when
        ``options['stop']`` becomes set during the run
    """
    if not options:
        # An empty options object carries no 'logger' entry, so the module
        # level logging functions are used (previously this line crashed with
        # AttributeError on None).
        logging.error("Options object was not properly initiated. ")
        return None
    logger = options.get('logger')
    alignment_rules = tree_aligner.AlignmentRules(
        delete_func=lambda tree_value, is_target: shapiro_tree_aligner.
        delete_shapiro_func(tree_value, is_target, options['reduced_bi']),
        cmp_func=shapiro_tree_aligner.cmp_shapiro_tree_values,
        merge_func=shapiro_tree_aligner.merge_shapiro_tree_values,
        minmax_func=min)
    options['alignment_rules'] = alignment_rules
    # init rng
    rng_seed = options.get('rng')
    if rng_seed is not None:
        random.seed(rng_seed)
    # init loop variables
    no_iterations = options.get('iter')
    no_lookahead = options.get('look_ahead')
    # init initial sequence
    # NOTE(review): stays None when no starting sequence is given and 'random'
    # is not set - presumably vienna.inverse_seq_ready accepts None; confirm
    current_sequence = options.get('starting_sequence')
    if current_sequence is None:
        if options.get('random'):
            current_sequence = generate_random_start(
                len(options['target_structure']),
                options['target_sequence'].replace('T', 'U'))
    else:
        current_sequence = current_sequence.replace('T', 'U')
    # Vienna starts the process
    vienna_sequence = vienna.inverse(
        options['target_structure'],
        vienna.inverse_seq_ready(options['target_sequence'], current_sequence))
    # vienna might fail initiation + removing any wildcard left
    if vienna_sequence is None or vienna_sequence == '':
        if options.get('starting_sequence') is None:
            current_sequence = generate_random_start(
                len(options['target_structure']),
                options['target_sequence'].replace('T', 'U'))
        else:
            current_sequence = generate_random_start(
                len(options['target_structure']),
                options.get('starting_sequence').replace('T', 'U'))
    else:
        current_sequence = generate_random_start(
            len(options['target_structure']),
            vienna_sequence.upper().replace('T', 'U'))
    final_result = current_sequence
    # setup target tree and get initial sequence score (and max score)
    target_tree = shapiro_tree_aligner.get_tree(options['target_structure'],
                                                options['target_sequence'])
    if not merge_motifs(target_tree, options.get('motifs')):
        shapiro_str = shapiro_generator.get_shapiro(
            options['target_structure']).shapiro
        logging.error(
            'Motif list does not match target structure {}\nTarget Shapiro:{}'.
            format(options.get('motifs'), shapiro_str))
        return None
    # NOTE(review): optimal_score is not used further down
    _, optimal_score = shapiro_tree_aligner.align_trees(
        target_tree, target_tree, options['alignment_rules'])
    match_tree, current_score = score_sequence(current_sequence, target_tree,
                                               options)
    best_score = current_score
    logger.info('Initial sequence ({}): {}\nAlign tree: {}'.format(
        current_score, current_sequence, match_tree))
    updater = options.get('updater')
    # main loop
    for iteration in range(0, no_iterations):
        # 'stop' is re-read from options on every pass so that it can be set
        # while the search is running
        if options.get('stop') is not None:
            return None
        if best_score == 0:
            break
        progress = False
        for _ in range(0, no_lookahead):
            if options.get('stop') is not None:
                return None
            new_sequence = mutator.perturbate(current_sequence, match_tree,
                                              options)
            new_tree, new_score = score_sequence(new_sequence, target_tree,
                                                 options)
            temperature = calc_temp(iteration, no_iterations)
            probability = acceptance_probability(current_score, new_score,
                                                 temperature,
                                                 len(current_sequence))
            logger.debug("iteration {} - TEMP: {} PROBABILITY: {}".format(
                iteration + 1, temperature, probability))
            if random.random() < probability:
                progress = True
                break
        # new_* are only read when a candidate was accepted, so they are
        # always bound at this point
        if progress:
            current_sequence = new_sequence
            current_score = new_score
            match_tree = new_tree
        # "<=" keeps the most recent sequence among equally scored ones
        if current_score <= best_score:
            best_score = current_score
            final_result = current_sequence
        logger.debug(
            'Iteration {} current sequence ({}): {}\nAlign tree: {}'.format(
                iteration + 1, current_score, current_sequence, match_tree))
        if updater is not None:
            updater.update(iteration + 1)

    return final_result