def shmir_from_fasta(siRNA, offtarget, regexp, original_frames, prefix):
    """Fold and score every frame for one siRNA, keeping the valid ones.

    The siRNA and its reverse complement are placed into deep copies of
    ``original_frames`` (no shifts are applied), each resulting template is
    folded by the ``fold`` subtask, and every folding is scored with
    ``score_from_transcript``.

    Args:
        siRNA: siRNA sequence to build the sh-miRs from.
        offtarget: offtarget value forwarded to ``score_from_transcript``.
        regexp: regexp identifier forwarded to ``score_from_transcript``.
        original_frames: frames (backbones) to insert the siRNA into.
        prefix: forwarded to ``fold`` as its ``prefix`` keyword.

    Returns:
        List of dicts with the keys ``score``, ``frame``, ``folding`` and
        ``found_sequence`` for every frame whose score passes
        ``validate_transcript_by_score``.
    """
    complement = reverse_complement(siRNA)

    # There are no shifts in this flow, hence both offsets are zero.
    frames = adjusted_frames(
        siRNA, complement, 0, 0, deepcopy(original_frames)
    )
    templates = [frame.template() for frame in frames]

    with allow_join_result():
        folding_jobs = group(
            fold.s(template, prefix=prefix).set(queue="subtasks")
            for template in templates
        )
        foldings = folding_jobs.apply_async().get()

    accepted = []
    for frame, original_frame, folding in izip(
        frames, original_frames, foldings
    ):
        score = score_from_transcript(
            frame, original_frame, folding["ss"], offtarget, regexp
        )
        if not validate_transcript_by_score(score):
            continue
        accepted.append({
            "score": score,
            "frame": frame,
            "folding": folding,
            "found_sequence": siRNA,
        })

    return accepted
def parse_input(sirna):
    """Validate siRNA input and return the best pairing of its strands.

    Input limitations (enforced by the helpers this function delegates to,
    as stated by the original documentation):
        * possible letters: {ACTGUactgu}; every 'u' is changed to 't'
        * length 19-21
        * one strand, or two strands separated by a space
        * two strands must be in correct 5'-3' orientation
        * |_20%_| mismatches are allowed

    For correct input the result is 'first sequence' (19-21nt),
    'second sequence' (19-21nt), left_end {-4,-3,-2,-1,0,1,2,3,4} and
    right_end {-4,-3,-2,-1,0,1,2,3,4}.

    Messages:
        * "correct sequence"
        * "changed 'u' to 't'"
        * "cut 'uu' or 'tt' ends"

    Errors:
        * "too short"
        * "insert your siRNA sequence"
        * "too long"
        * "insert only one siRNA sequence or both strands of one siRNA at a
          time; check if both strands are in 5'-3' orientation"
        * "sequence can contain only {actgu} letters"

    Args:
        sirna: sequence (str) to be checked; either one strand or two
            strands separated by a single space.

    Returns:
        tuple from best_complementarity

    Raises:
        ValidationError
    """
    if " " in sirna:
        # Only the first space splits; anything after it is the second strand.
        strands = sirna.split(" ", 1)
    else:
        strands = [sirna, reverse_complement(sirna)]

    # Build a real list: the sequences are iterated for validation and then
    # unpacked again. A lazy ``map`` object would already be exhausted by the
    # time it is unpacked, leaving best_complementarity without arguments.
    sequences = [replace_mocules(strand) for strand in strands]

    for sequence in sequences:
        validate_sirna(sequence)

    return best_complementarity(*sequences)
def shmir_from_sirna_score(input_str):
    """Build sh-miRs for the given siRNA input and return the best scored.

    The input is parsed with ``check_input``; when no second strand comes
    back, the reverse complement of the first one is used. Every Backbone
    from the database is combined with the strands, then folded and scored
    by the ``fold_and_score`` subtask using ``score_from_sirna``.

    Args:
        input_str(str): Input string containing one or two sequences.

    Returns:
        At most three results, ordered by descending score (first element
        of each result). Only results scoring above 60 are kept, and each
        one is returned without its last element.
    """
    seq1, seq2, shift_left, shift_right = check_input(input_str)
    if not seq2:
        seq2 = reverse_complement(seq1)

    original_frames = db_session.query(Backbone).all()
    # Work on copies so the queried Backbone objects stay untouched.
    frames = get_frames(
        seq1, seq2, shift_left, shift_right, deepcopy(original_frames)
    )

    with allow_join_result():
        scoring_jobs = group(
            fold_and_score.s(
                seq1, seq2, frame_tuple, original, score_from_sirna, (seq1,)
            ).set(queue="subtasks")
            for frame_tuple, original in zip(frames, original_frames)
        )
        frames_with_score = scoring_jobs.apply_async().get()

    ranked = sorted(
        frames_with_score, key=operator.itemgetter(0), reverse=True
    )
    passing = [elem[:-1] for elem in ranked if elem[0] > 60]
    return passing[:3]
def shmir_from_fasta_string(fasta_string, original_frames, actual_offtarget,
                            regexp_type, path):
    """Generate sh-miRs from a fasta string and keep the well scored ones.

    The sequence and its reverse complement are placed into deep copies of
    ``original_frames`` (without shifts); each frame is folded and scored by
    the ``fold_and_score`` subtask using ``score_from_transcript``.

    Args:
        fasta_string(str): Sequence.
        original_frames: original Backbone objects to build frames from.
        actual_offtarget(int): offtarget value passed on to the scoring.
        regexp_type(int): regexp identifier passed on to the scoring.
        path: forwarded unchanged to ``fold_and_score``.

    Returns:
        Results whose score notes have ``'frame'`` above 60 and ``'all'``
        above 100, sorted by descending ``'all'`` score, or None when no
        result qualifies. The first element of every returned result is
        replaced in place by its ``'all'`` score.
    """
    seq2 = reverse_complement(fasta_string)
    frames = get_frames(fasta_string, seq2, 0, 0, deepcopy(original_frames))

    with allow_join_result():
        scoring_jobs = group(
            fold_and_score.s(
                fasta_string, seq2, frame_tuple, original,
                score_from_transcript, (actual_offtarget, regexp_type), path
            ).set(queue="subtasks")
            for frame_tuple, original in zip(frames, original_frames)
        )
        frames_with_score = scoring_jobs.apply_async().get()

    accepted = []
    for frame in frames_with_score:
        notes = frame[0]
        if not (notes['frame'] > 60 and notes['all'] > 100):
            continue
        # Swap the notes dict for the overall score so it can be the sort key.
        frame[0] = notes['all']
        accepted.append(frame)

    ranked = sorted(accepted, key=operator.itemgetter(0), reverse=True)
    if not ranked:
        return None
    return ranked
def test_reverse_complement(self):
    """reverse_complement("atcgatcg") must give "cgatcgat"."""
    expected = "cgatcgat"
    actual = reverse_complement("atcgatcg")
    self.assertEqual(actual, expected)
def shmir_from_transcript_sequence(
    transcript_name, minimum_CG, maximum_CG, maximum_offtarget, scaffold,
    immunostimulatory
):
    """Generate sh-miRs from a transcript sequence.

    Stored results are returned directly when present. Otherwise the mRNA is
    fetched, candidate sequences are searched with the regexp patterns of
    the selected frames, validated, folded and scored. When that yields
    nothing, every possible 21-nt sequence of the reversed mRNA is tried
    against all selected frames. The best results are stored and returned.

    Args:
        transcript_name(str): Name of transcript.
        minimum_CG(int): Minimum number of 'C' and 'G' nucleotides in
            sequence.
        maximum_CG(int): Maximum number of 'C' and 'G' nucleotides in
            sequence.
        maximum_offtarget(int): Maximum offtarget.
        scaffold(str): Name of frame of miRNA or 'all'.
        immunostimulatory(str): One of 'yes', 'no', 'no_difference'.

    Returns:
        list of sh-miR(s).
    """
    # Check whether results are already in the database. An empty list may
    # come back and is returned as-is; only None means "not computed yet".
    stored = get_results(
        transcript_name, minimum_CG, maximum_CG, maximum_offtarget, scaffold,
        immunostimulatory
    )
    if stored is not None:
        return stored

    path = create_path_string(
        transcript_name, minimum_CG, maximum_CG, maximum_offtarget, scaffold,
        immunostimulatory
    )

    reversed_mRNA = reverse_complement(ncbi_api.get_mRNA(transcript_name))

    original_frames = frames_by_scaffold(scaffold)
    frames_by_name = {frame.name: frame for frame in original_frames}

    # Best patterns should be chosen first, hence the descending key order.
    patterns = {}
    for frame in original_frames:
        regexps = json.loads(frame.regexp)
        patterns[frame.name] = OrderedDict(
            sorted(regexps.items(), reverse=True)
        )

    with allow_join_result():
        validation_jobs = group(
            validate_sequences.s(
                list(sequences),  # generators are not serializable
                regexp_type,
                name,
                minimum_CG,
                maximum_CG,
                maximum_offtarget,
                immunostimulatory,
            ).set(queue="score")
            for name, patterns_dict in patterns.iteritems()
            for regexp_type, sequences in find_by_patterns(
                patterns_dict, reversed_mRNA
            ).iteritems()
        )
        validated = validation_jobs.apply_async().get()

    best_sequences = merge_results(validated)

    with allow_join_result():
        folding_jobs = group(
            shmir_from_fasta.s(
                siRNA["sequence"],
                siRNA["offtarget"],
                siRNA["regexp"],
                [frames_by_name[name]],
                path,
            ).set(queue="score")
            for name, siRNA in unpack_dict_to_list(best_sequences)
        )
        per_sirna_results = folding_jobs.apply_async().get()

    # Merge the per-siRNA result lists into one flat list.
    results = list(chain.from_iterable(per_sirna_results))

    if not results:
        # Fallback: no pattern-based candidate survived, so try every
        # possible 21-nt sequence against all selected frames.
        with allow_join_result():
            fallback_job = validate_sequences.s(
                list(all_possible_sequences(reversed_mRNA, 21)),  # not serializable
                0,
                "all",
                minimum_CG,
                maximum_CG,
                maximum_offtarget,
                immunostimulatory,
            )
            validated = fallback_job.apply_async(queue="subtasks").get()

        best_sequences = merge_results([validated])

        with allow_join_result():
            folding_jobs = group(
                shmir_from_fasta.s(
                    siRNA["sequence"],
                    siRNA["offtarget"],
                    siRNA["regexp"],
                    original_frames,
                    path,
                ).set(queue="score")
                for name, siRNA in unpack_dict_to_list(best_sequences)
            )
            per_sirna_results = folding_jobs.apply_async().get()

        # Merge the per-siRNA result lists.
        results = chain.from_iterable(per_sirna_results)

    ranked = sorted(
        results, key=lambda item: item["score"]["all"], reverse=True
    )
    sorted_results = ranked[:TRANSCRIPT_RESULT_LIMIT]

    db_results = store_results(
        transcript_name, minimum_CG, maximum_CG, maximum_offtarget, scaffold,
        immunostimulatory, sorted_results
    )

    kept_task_ids = [result.get_task_id() for result in db_results]
    remove_bad_foldings(path, kept_task_ids)

    return [result.as_json() for result in db_results]