def _top_affinity_seq(roundFile):
    """Return (seq, count, dist) for the row with the smallest distance.

    Each non-blank line of ``roundFile`` is split on whitespace and read as
    ``sequence  count  distance``. Ties keep the first row encountered.

    Raises:
        ValueError: if the file contains no data rows.
    """
    best = None
    with open(roundFile, 'r') as f:
        for line in f:
            row = line.split()
            if not row:
                # tolerate blank / trailing lines
                continue
            seq = str(row[0])
            count = int(row[1])
            dist = int(row[2])
            if best is None or dist < best[2]:
                best = (seq, count, dist)
    if best is None:
        raise ValueError("no sequences found in " + roundFile)
    return best


def _write_affstructure(infoFile, seqInfo, svgName):
    """Fold one sequence, append its tab-separated record and plot it."""
    seq, seq_count, seq_dist = seqInfo
    # fold once and reuse both fields (index 0: structure, index 1: mfe)
    folded = fold(seq)
    seq_struct = folded[0]
    seq_mfe = folded[1]
    infoFile.write('\t'.join([seq, seq_struct, str(seq_mfe),
                              str(seq_count), str(seq_dist)]) + '\n')
    svg_rna_plot(seq, seq_struct, svgName)


def aptamer_structs_aff(fileNames, seqLength, roundNum, rounds='final'):
    """Report and plot the structure of the lowest-distance sequence.

    Round files are named ``fileNames + "_R" + <round>`` and hold
    whitespace-separated rows of ``sequence count distance``.

    Args:
        fileNames: common path prefix of the round files.
        seqLength: unused; kept for interface compatibility.
        roundNum: number of the final round (rounds are numbered from 1).
        rounds: ``'final'`` processes only round ``roundNum`` and writes
            ``<prefix>_R<roundNum>_affstructure_info``; ``'all'`` processes
            rounds ``1..roundNum`` (one best sequence per round) and writes
            ``<prefix>_R<roundNum>_affstructures_info``. In both modes an
            ``_affstructure.svg`` plot is produced per processed round.

    Returns:
        0 on success; None (after printing a message) for any other
        value of ``rounds``.

    Raises:
        ValueError: if a round file contains no sequences.
    """
    if rounds == 'final':
        prefix = fileNames + "_R" + str(roundNum)
        top_seq_info = _top_affinity_seq(prefix)
        with open(prefix + "_affstructure_info", 'w') as f:
            _write_affstructure(f, top_seq_info,
                                prefix + "_affstructure.svg")
        return 0
    elif rounds == 'all':
        # one best (minimum-distance) sequence per round, in round order
        top_seqs_info = [
            _top_affinity_seq(fileNames + "_R" + str(rnd + 1))
            for rnd in range(roundNum)
        ]
        with open(fileNames + "_R" + str(roundNum) + "_affstructures_info",
                  'w') as f:
            for rnd, seq_info in enumerate(top_seqs_info):
                _write_affstructure(
                    f, seq_info,
                    fileNames + "_R" + str(rnd + 1) + "_affstructure.svg")
        return 0
    else:
        print("invalid option for string varible rounds. Exiting...")
def stochasticLoopSelection_initial(self, alphabetSet, seqLength, aptPool,
                                    selectionThreshold, totalSeqNum,
                                    samplingSize, outputFileNames, rnd,
                                    stringency):
    """Sample the initial library to disk, then run loop-based selection.

    Draws ``samplingSize`` random indices below ``totalSeqNum - 1``, writes
    the generated sequences (one per line) to
    ``outputFileNames + "_samples_R" + str(rnd)``, and then delegates to
    ``self.selectionProcess_loop_initial`` with the folded structure and
    loop of ``aptPool``.

    Returns:
        Whatever ``selectionProcess_loop_initial`` returns, seeded with an
        empty dict.
    """
    # --- sampling ---
    print("sampling from initial library...")
    # `random` is called with size=..., so presumably numpy.random -- verify
    sampledIdxs = random.randint(0, int(totalSeqNum - 1), size=samplingSize)
    samplePath = outputFileNames + "_samples_R" + str(rnd)
    with open(samplePath, 'w') as sampleFile:
        sampleFile.writelines(
            Apt.pseudoAptamerGenerator(idx, alphabetSet, seqLength) + '\n'
            for idx in sampledIdxs)
    print("Sampling completed")

    # --- reference structure and loop of the target aptamer ---
    aptStruct = fold(aptPool)[0]
    aptLoop = apt_loopFinder(aptPool, aptStruct, seqLength)

    # --- stochastic selection until threshold is met ---
    print("Selection has started")
    selected = self.selectionProcess_loop_initial(
        {}, aptPool, aptStruct, aptLoop, selectionThreshold,
        alphabetSet, seqLength, totalSeqNum, stringency)
    print("sequence selection has been carried out")
    return selected
def loop_func(self, seq1, seq1_struct, seq1_loop, seq2, seqLength):
    """Return the loop-plus-base-pair distance of seq2 to a reference.

    seq2 is folded, a loop region is cut out of it by scanning its
    dot-bracket string, and the result is
    ``int(lavenshtein(seq1_loop, loop) + bp_distance(seq1_struct, struct))``.
    ``seq1`` itself is not used.
    """
    struct2 = fold(seq2)[0]
    scanLimit = seqLength - 1

    # Forward scan for the first ')'; pos ends one past the last symbol read.
    # NOTE(review): the final position is never inspected, and a ')' at
    # index seqLength - 2 also leaves pos == scanLimit, so it is handled
    # by the "not found" branch below -- confirm this is intended.
    symbol = None
    pos = 0
    while pos < scanLimit and symbol != ')':
        symbol = struct2[pos]
        pos += 1

    if pos != scanLimit:
        # ')' found: walk back to the nearest '(' and take what lies between
        closeIdx = pos - 1
        while symbol != '(':
            pos -= 1
            symbol = struct2[pos - 1]
        loop2 = seq2[pos:closeIdx]
    else:
        # scan ran to the limit: walk back to the last '(' (if any)
        while pos > 0 and symbol != '(':
            pos -= 1
            symbol = struct2[pos]
        # pos == 0 -> whole sequence; otherwise the tail starting at pos
        loop2 = seq2[pos:] if pos else seq2

    loopDist = self.lavenshtein_func(seq1_loop, loop2)
    bpDist = bp_distance(seq1_struct, struct2)
    return int(loopDist + bpDist)
def loop_func(self, seq1, seq1_struct, seq1_loop, seq2, seqLength):
    """Combined loop (Levenshtein) and base-pair distance for seq2.

    Args:
        seq1: reference sequence (not used by this method).
        seq1_struct: dot-bracket structure of the reference.
        seq1_loop: loop region of the reference.
        seq2: sequence to compare; it is folded here.
        seqLength: bound used when scanning seq2's structure.

    Returns:
        int: loop distance plus base-pair distance.
    """
    # secondary structure of the candidate
    candStruct = fold(seq2)[0]
    lastIdx = seqLength - 1

    # look for a 3' paired nucleotide, reading left to right
    # NOTE(review): position lastIdx is never read, and a ')' at
    # lastIdx - 1 is treated the same as "none found" -- confirm.
    char = None
    cursor = 0
    while cursor < lastIdx and char != ')':
        char = candStruct[cursor]
        cursor += 1

    if cursor != lastIdx:
        # candidate has a loop: bounded by the ')' just read and the
        # closest '(' to its left
        loopStop = cursor - 1
        while char != '(':
            cursor -= 1
            char = candStruct[cursor - 1]
        candLoop = seq2[cursor:loopStop]
    else:
        # scan exhausted: search backwards for a 5' paired nucleotide
        while cursor > 0 and char != '(':
            cursor -= 1
            char = candStruct[cursor]
        if cursor:
            # loop is a dangling end
            candLoop = seq2[cursor:]
        else:
            # no loop at all: use the whole sequence
            candLoop = seq2

    # Levenshtein distance between loops + base-pair distance of structures
    total = (self.lavenshtein_func(seq1_loop, candLoop)
             + bp_distance(seq1_struct, candStruct))
    return int(total)
def loop_components_func(self, seq1, seq1_struct, seq1_loop, seq2, seqLength):
    """Return the loop and base-pair distances of seq2 separately.

    seq2 is folded and a loop region is extracted from it by scanning
    the dot-bracket string.
    ``seq1`` is not used.

    Returns:
        tuple: ``(loop_distance, bp_distance)`` where the first item is
        ``self.lavenshtein_func(seq1_loop, <loop of seq2>)`` and the second
        is ``bp_distance(seq1_struct, <structure of seq2>)``.
    """
    struct2 = fold(seq2)[0]
    scanLimit = seqLength - 1

    # forward scan for the first ')'
    # NOTE(review): the last position is never read, and a ')' at index
    # seqLength - 2 falls into the "not found" branch -- confirm intended.
    symbol = None
    pos = 0
    while pos < scanLimit and symbol != ')':
        symbol = struct2[pos]
        pos += 1

    if pos != scanLimit:
        # loop lies between the nearest '(' to the left and the ')' found
        closeIdx = pos - 1
        while symbol != '(':
            pos -= 1
            symbol = struct2[pos - 1]
        loop2 = seq2[pos:closeIdx]
    else:
        # no ')' seen: back up to the last '(' or to the start
        while pos > 0 and symbol != '(':
            pos -= 1
            symbol = struct2[pos]
        loop2 = seq2[pos:] if pos else seq2

    loopDist = self.lavenshtein_func(seq1_loop, loop2)
    bpDist = bp_distance(seq1_struct, struct2)
    return loopDist, bpDist
def _get_reward(self, terminal):
    """
    Score the completed design against the target structure.

    Args:
        terminal: Bool, True once the final timestep has been reached.

    Returns:
        0 while not terminal; otherwise
        (1 - normalized Hamming distance) ** reward_exponent.
        An EpisodeInfo record is appended to self.episodes_info as a
        side effect of every terminal call.
    """
    if not terminal:
        return 0

    predicted, _ = fold(self.design.primary)
    distance = hamming(predicted, self.target.dot_bracket)

    # Near misses (non-zero but below the threshold) get a local search.
    if 0 < distance < self._env_config.mutation_threshold:
        distance = self._local_improvement(predicted)

    normalized = distance / len(self.target)

    # For hparam optimization
    self.episodes_info.append(
        EpisodeInfo(
            target_id=self.target.id,
            time=time.time(),
            normalized_hamming_distance=normalized,
        )
    )

    return (1 - normalized) ** self._env_config.reward_exponent
def _local_improvement(self, folded_design):
    """
    Search all nucleotide assignments at the mismatching positions.

    Every combination of "AGCU" over the positions where folded_design
    differs from the target dot-bracket is passed to
    self.design.get_mutated, folded, and compared with the target.

    Args:
        folded_design: Dot-bracket structure of the current design.

    Returns:
        The minimum Hamming distance over all improved candidates
        (0 as soon as an exact match is found).
    """
    target_structure = self.target.dot_bracket
    sites = _string_difference_indices(target_structure, folded_design)

    best = None
    for assignment in product("AGCU", repeat=len(sites)):
        candidate = self.design.get_mutated(assignment, sites)
        candidate_structure, _ = fold(candidate.primary)
        distance = hamming(candidate_structure, target_structure)
        if best is None or distance < best:
            best = distance
        if distance == 0:  # For better timing results
            return 0
    return best
def distance_range(scale, ref_seq, seqLength, alphabetSet):
    """Plot distance distributions of random sequences to a reference.

    ``scale`` random sequences are generated and their Hamming, base-pair
    and loop distances to ``ref_seq`` are histogrammed into
    ``int(seqLength * 1.5)`` bins, normalised by ``scale``, smoothed and
    plotted. The figure is saved as a PDF named
    ``SELEX_Analytics_distance_distributions``.

    Args:
        scale: number of random sequences to draw.
        ref_seq: reference sequence.
        seqLength: length of the generated sequences.
        alphabetSet: alphabet passed to ``apt.pseudoAptamerGenerator``.

    Returns:
        The normalised Hamming-distance histogram (numpy array).
    """
    ref_struct = fold(ref_seq)[0]
    ref_loop = apt_loopFinder(ref_seq, ref_struct)

    numBins = int(seqLength * 1.5)
    hamm_dist_array = np.zeros(numBins)
    bp_dist_array = np.zeros(numBins)
    loop_dist_array = np.zeros(numBins)

    # NOTE(review): the index range is hard-coded to 4**20, which looks
    # like "4 letters, length 20" -- confirm whether it should follow
    # alphabetSet / seqLength.
    randIdxs = random.randint(0, 4**(20) - 1, size=scale)
    for randIdx in randIdxs:
        randSeq = apt.pseudoAptamerGenerator(randIdx, alphabetSet, seqLength)
        randHammDist = d.hamming_func(randSeq, ref_seq)
        randbpDist = d.bp_func(ref_struct, randSeq)
        randLoopDist = d.loop_func(ref_seq, ref_struct, ref_loop, randSeq,
                                   seqLength)
        hamm_dist_array[randHammDist] += 1
        bp_dist_array[randbpDist] += 1
        loop_dist_array[randLoopDist] += 1

    # counts -> fractional frequencies
    hamm_dist_array /= scale
    bp_dist_array /= scale
    loop_dist_array /= scale

    fig, axis = plt.subplots(1, 1)
    # One x value per histogram bin. The former seqLength + 10 points only
    # matched the int(seqLength * 1.5) bins when seqLength == 20; for that
    # length the axis below is unchanged.
    distAxis = np.linspace(0, numBins - 1, numBins)
    distAxis_smooth = np.linspace(0, numBins - 1, 200)
    # NOTE(review): `spline` is presumably scipy.interpolate.spline, which
    # newer SciPy releases no longer provide -- verify the pinned version.
    hamm_dist_smooth = spline(distAxis, hamm_dist_array, distAxis_smooth)
    bp_dist_smooth = spline(distAxis, bp_dist_array, distAxis_smooth)
    loop_dist_smooth = spline(distAxis, loop_dist_array, distAxis_smooth)

    axis.plot(distAxis_smooth, hamm_dist_smooth, label='Hamming')
    axis.plot(distAxis_smooth, bp_dist_smooth, label='Base-Pair')
    axis.plot(distAxis_smooth, loop_dist_smooth, label='Loop')
    axis.set_xlim([0, 25])
    axis.set_ylim([0, 0.4])
    axis.legend()

    fig.text(0.5, 0.04, 'Distance', ha='center')
    fig.text(0.04, 0.5, 'Fractional Frequency', va='center',
             rotation='vertical')
    fig.text(0.5, 0.95, 'Distance Distributions', ha='center')
    fig.savefig("SELEX_Analytics_distance_distributions", format='pdf')
    return hamm_dist_array
def check_reverse_rnafold(**kwargs):
    '''
    Check whether folding a reversed sequence yields the reversed structure.

    RNAfold is directional aware, so generated rna graphs need to
    consider both directions. Every (seq, struct) row of
    data/rna_dataset_<length>.csv is re-folded in reverse and each
    mismatch is printed as: seq, struct, reversed_struct.

    :param kwargs: ``length`` (default 32) selects the dataset file;
        ``size`` (default 2e7) is passed to ``generate_seq_dataset`` when
        that file does not exist yet.
    :return: None
    '''
    length = kwargs.get('length', 32)
    size = kwargs.get('size', 2e7)

    dataset_path = os.path.join(
        basedir, 'data', 'rna_dataset_%d.csv' % (length))
    if not os.path.exists(dataset_path):
        generate_seq_dataset(size, length)

    with open(dataset_path, 'r') as f:
        table = pd.read_csv(f)

    for seq, struct in zip(table['seq'], table['struct']):
        reversed_struct = fold(seq[::-1])[0]
        if struct[::-1] == reversed_struct:
            continue
        print(seq, struct, reversed_struct)
def _predict_rnalib(fasta_entry):
    """Fold one entry and return its fields followed by the fold output.

    ``fasta_entry[1]`` is passed to ``RNA.fold`` (presumably the sequence --
    verify against the caller). The result is a flat tuple: every item of
    ``fasta_entry`` followed by every item ``fold`` returned.
    """
    # imported here rather than at module level, as in the original
    from RNA import fold

    entry_fields = tuple(fasta_entry)
    fold_fields = tuple(fold(fasta_entry[1]))
    return entry_fields + fold_fields
def bp_func(self, seq1_struct, seq2):
    """Return the base-pair distance between seq1_struct and folded seq2.

    seq2 is folded first; the first item of the fold result is compared
    with ``seq1_struct`` via ``bp_distance``.
    """
    return bp_distance(seq1_struct, fold(seq2)[0])