def mature_generator(lines):
    """Build output records for putative precursors that contain a valid duplex.

    ``lines`` is consumed three entries at a time: an information line, the
    precursor sequence, and its dot-bracket secondary structure.  Processing
    stops at the end of the list or at the first blank information line.

    A record is dropped when:
      * ``DISCARD_NO_READ_PREC_FLAG`` is set and
        ``SeqModule.check_no_read_prec`` returns ``True`` for it;
      * the left half of the structure contains no "(" at all, or its
        ")"-to-"(" ratio exceeds ``NON_CANONICAL_PREC_FACTOR``;
      * ``SeqModule.star_identifier_v2`` returns all-zero coordinates
        (star sequence not found).

    Returns:
        list: one ``SeqModule.generate_output_form`` result per kept record.

    Raises:
        IndexError: if the final record has fewer than three lines.
    """
    output_list = []
    iterator = 0
    while iterator != len(lines):
        # each loop should read exactly 3 lines
        line_info = lines[iterator].strip()
        if line_info == "":
            break
        line_seq = lines[iterator + 1].strip()
        line_db = lines[iterator + 2].strip()
        iterator += 3

        # if no read data is matched in putative precursors, discard it
        if DISCARD_NO_READ_PREC_FLAG:
            no_read_prec_flag = SeqModule.check_no_read_prec(
                line_info, map_data, MIN_READ_COUNT_THRESHOLD)
            if no_read_prec_flag is True:
                continue

        # check conserved sequence with blastn
        # if this line_info is classified as conserved sequence, update line_info
        # NOTE: the "updated" flag is currently not acted upon; conserved hits
        # still go through the duplex search below.
        if ANNOTATE_FLAG in ('true', 'True'):
            line_info, _updated_flag = SeqModule.check_conserved_seq(
                line_info, line_seq, blastn_path, mirbase_path,
                ARM_EXTEND_THRESHOLD)

        # Discard non-canonical (i.e. "hard to identify") precursor
        # "Asymmetric" dot-bracket notation precursor : low accuracy, hard to
        # identify star seq, and too many outputs
        # if ")" portion is large in "left side", it's non-canonical
        # Floor division keeps the slice index an integer on every interpreter.
        line_db_left = line_db[:len(line_db) // 2]
        num_open = line_db_left.count("(")
        num_close = line_db_left.count(")")
        if num_open == 0:
            # No opening bracket on the left side: the ratio is undefined
            # (previously a ZeroDivisionError).  Skip the record, matching the
            # guard precursor_generator applies to the same situation.
            continue
        if float(num_close) / num_open > NON_CANONICAL_PREC_FACTOR:
            continue

        # find valid star sequence from putative precursors
        start_5p, end_5p, start_3p, end_3p = SeqModule.star_identifier_v2(
            line_db, MATURE_MIN_LEN, MATURE_MAX_LEN, MAX_SERIAL_MISMATCH,
            MAX_MULT_MISMATCH, MAX_SERIAL_BULGE, MAX_MULT_BULGE)
        if start_5p == 0 and end_5p == 0 and start_3p == 0 and end_3p == 0:
            # star seq not found
            continue

        # collect the putative precursor record for the output file
        output_form = SeqModule.generate_output_form(
            line_info, line_seq, line_db, start_5p, start_3p, end_5p, end_3p,
            map_data, MIN_READ_COUNT_THRESHOLD)
        output_list.append(output_form)
    return output_list
def mature_generator(lines):
    """Build output records for putative precursors that contain a valid duplex.

    ``lines`` is consumed three entries at a time: an information line, the
    precursor sequence, and its dot-bracket secondary structure.  Processing
    stops at the end of the list or at the first blank information line.

    A record is dropped when:
      * ``DISCARD_NO_READ_PREC_FLAG`` is set and
        ``SeqModule.check_no_read_prec`` returns ``True`` for it;
      * the left half of the structure contains no "(" at all, or its
        ")"-to-"(" ratio exceeds ``NON_CANONICAL_PREC_FACTOR``;
      * ``SeqModule.star_identifier_v2`` returns all-zero coordinates
        (star sequence not found).

    Returns:
        list: one ``SeqModule.generate_output_form`` result per kept record.

    Raises:
        IndexError: if the final record has fewer than three lines.
    """
    output_list = []
    iterator = 0
    while iterator != len(lines):
        # each loop should read exactly 3 lines
        line_info = lines[iterator].strip()
        if line_info == "":
            break
        line_seq = lines[iterator + 1].strip()
        line_db = lines[iterator + 2].strip()
        iterator += 3

        # if no read data is matched in putative precursors, discard it
        if DISCARD_NO_READ_PREC_FLAG:
            no_read_prec_flag = SeqModule.check_no_read_prec(
                line_info, map_data, MIN_READ_COUNT_THRESHOLD)
            if no_read_prec_flag is True:
                continue

        # check conserved sequence with blastn
        # if this line_info is classified as conserved sequence, update line_info
        # NOTE: the "updated" flag is currently not acted upon; conserved hits
        # still go through the duplex search below.
        if ANNOTATE_FLAG in ('true', 'True'):
            line_info, _updated_flag = SeqModule.check_conserved_seq(
                line_info, line_seq, blastn_path, mirbase_path,
                ARM_EXTEND_THRESHOLD)

        # Discard non-canonical (i.e. "hard to identify") precursor
        # "Asymmetric" dot-bracket notation precursor : low accuracy, hard to
        # identify star seq, and too many outputs
        # if ")" portion is large in "left side", it's non-canonical
        # Floor division keeps the slice index an integer on every interpreter.
        line_db_left = line_db[:len(line_db) // 2]
        num_open = line_db_left.count("(")
        num_close = line_db_left.count(")")
        if num_open == 0:
            # No opening bracket on the left side: the ratio is undefined
            # (previously a ZeroDivisionError).  Skip the record, matching the
            # guard precursor_generator applies to the same situation.
            continue
        if float(num_close) / num_open > NON_CANONICAL_PREC_FACTOR:
            continue

        # find valid star sequence from putative precursors
        start_5p, end_5p, start_3p, end_3p = SeqModule.star_identifier_v2(
            line_db, MATURE_MIN_LEN, MATURE_MAX_LEN, MAX_SERIAL_MISMATCH,
            MAX_MULT_MISMATCH, MAX_SERIAL_BULGE, MAX_MULT_BULGE)
        if start_5p == 0 and end_5p == 0 and start_3p == 0 and end_3p == 0:
            # star seq not found
            continue

        # collect the putative precursor record for the output file
        output_form = SeqModule.generate_output_form(
            line_info, line_seq, line_db, start_5p, start_3p, end_5p, end_3p,
            map_data, MIN_READ_COUNT_THRESHOLD)
        output_list.append(output_form)
    return output_list
def _is_dominant_site(site_counts, site, ref_length):
    """Screen one position for a Drosha / Dicer cutting site (inspired by miREAP).

    ``site_counts`` is the per-position read-count list of one reference and
    strand.  The site qualifies when it has at least 3 reads, no position
    within 19 nt on either side has more reads than the site itself, and both
    dominance ratios reach their ``DOMINANT_FACTOR`` thresholds.
    """
    count = site_counts[site]
    if count < 3:
        return False
    count_region = count  # site plus every in-range flank within 19 nt
    count_sites = count   # site plus in-range flanks within 2 nt
    for offset in range(1, 20):
        left = site - offset
        right = site + offset
        # Offsets whose flank falls outside the reference are ignored.
        if left < 0 or right >= ref_length:
            continue
        if site_counts[left] > count or site_counts[right] > count:
            return False
        count_region += site_counts[left]
        count_region += site_counts[right]
        if offset < 3:
            count_sites += site_counts[left]
            count_sites += site_counts[right]
    if float(count_sites) / count_region < DOMINANT_FACTOR:
        return False
    return float(count) / count_sites >= DOMINANT_FACTOR / 2.0


def _is_canonical_hairpin(structure):
    """Return False for asymmetric ("hard to identify") dot-bracket structures.

    The left half must contain "(" and the right half ")", and the share of
    wrong-direction brackets in each half must not exceed
    ``NON_CANONICAL_PREC_FACTOR``.
    """
    half = len(structure) // 2  # floor division: must be an int slice index
    left = structure[:half]
    right = structure[half:]
    num_open_left = left.count("(")
    num_close_right = right.count(")")
    if num_open_left == 0 or num_close_right == 0:
        return False
    if float(left.count(")")) / num_open_left > NON_CANONICAL_PREC_FACTOR:
        return False
    return float(right.count("(")) / num_close_right <= NON_CANONICAL_PREC_FACTOR


def precursor_generator(lines):
    """Find the best-folding putative precursor around each mapped read.

    Each entry of ``lines`` is split on whitespace and must have exactly 7
    fields, otherwise it is skipped.  Fields used: [1] read count,
    [2] reference name (looked up in ``ref_name_list``), [3] start, [4] end,
    [5] strand ("+" or "-"), [6] read sequence.

    For every well-formed line the read count is added to the running total
    and to the length / 5'-nucleotide distribution.  Lines whose strand is
    neither "+" nor "-" are then skipped.  Reads that pass the cutting-site
    screen are extended into candidate windows, folded with RNAfold, and the
    window with the highest length-normalised absolute MFE (and an absolute
    MFE of at least ``MIN_ABS_MFE``) is kept.

    Returns:
        tuple: ``(info_list, db_list, reads_total, length_distribution)``.
        ``info_list`` holds the stripped input line plus tab-separated
        absolute energy, normalised energy, window start and window end;
        ``db_list`` holds "sequence\\nstructure\\n" strings;
        ``length_distribution`` is a ``Counter``.
    """
    output_precursor_infolist = []
    output_precursor_dblist = []
    reads_total_partial = 0
    length_distribution_partial = count_list({})

    for line in lines:
        line_split = line.split()
        # Rare occasion of improper line data : should skip it
        if len(line_split) != 7:
            continue

        # accumulate raw rna seq read counts for calculation of RPM
        read_count = int(line_split[1])
        reads_total_partial += read_count

        # accumulate length distribution information, keyed by
        # "<read length><5' nucleotide>"
        read_seq = line_split[6]
        dict_key = str(len(read_seq)) + str(read_seq[0])
        length_distribution_partial[dict_key] += read_count

        name_list_index = ref_name_list.index(line_split[2])
        read_start = int(line_split[3])
        read_end = int(line_split[4])
        strand = line_split[5]

        # The screened position is the read start on "+" and the read end
        # on "-".
        if strand == "+":
            site_counts = ref_count_list_pos[name_list_index]
            site = read_start
        elif strand == "-":
            site_counts = ref_count_list_neg[name_list_index]
            site = read_end
        else:
            # Unknown strand: there is no sequence to fold (previously this
            # left rna_fold_seq unbound or stale from an earlier line).
            continue

        # Only the reference the read mapped to is searched.
        ref_seq = ref_seq_list[name_list_index]
        if not _is_dominant_site(site_counts, site, len(ref_seq)):
            continue

        # Precursor Candidate Information Variable List
        pc_seq = ""
        pc_structure = ""
        pc_start = 0
        pc_end = 0
        pc_abs_energy = 0
        pc_norm_abs_energy = 0

        # Find min. MFE of fold structure and save it to result_precursor
        # WARNING : if abs. of calculated free energy is less than 10,
        # output[2] does not contain proper value.  Skipping this precursor
        # line is proper, since threshold value is at least 18.
        # Arm extension is a single fixed value (the extension loop is disabled).
        arm = ARM_EXTEND_THRESHOLD
        read_span = read_end - read_start
        for distance in range(read_span, DISTANCE_THRESHOLD + read_span,
                              RNAFOLD_STEP):
            windows = (
                # Assuming -5p mature sequence
                (read_start - arm, read_end + distance + arm),
                # Assuming -3p mature sequence
                (read_start - distance - arm, read_end + arm),
            )
            for start, end in windows:
                # continue only if both indices are valid
                if not (start >= 0 and end < len(ref_seq)):
                    continue
                rna_fold_seq = ref_seq[start:end]
                if strand == "-":
                    rna_fold_seq = SeqModule.create_star(rna_fold_seq)
                # NOTE: the two rejections below use ``break`` so that a
                # rejected -5p window also skips the -3p window of the same
                # distance, exactly as the original control flow did.
                if "N" in rna_fold_seq:
                    break
                # universal_newlines=True keeps stdin/stdout as text on both
                # Python 2 and Python 3.
                rnafold = subprocess.Popen(
                    [RNAfold_path, "--noconv", "-d2", "--noPS", "--noLP"],
                    stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                    universal_newlines=True)
                output = rnafold.communicate(rna_fold_seq)[0].split()
                structure = output[1].strip("\n")

                # Discard non-canonical (i.e. "hard to identify") precursor
                if not _is_canonical_hairpin(structure):
                    break

                abs_energy = re.findall(r'\d*\.\d*', str(output[2]))
                if not abs_energy:
                    continue
                energy = float(abs_energy[0])
                if energy < MIN_ABS_MFE:
                    continue
                norm_abs_energy = energy / len(rna_fold_seq)
                # Keep the first acceptable window, then any strictly better
                # one (the old ``pc_seq == []`` test could never be true).
                if pc_seq == "" or norm_abs_energy > pc_norm_abs_energy:
                    pc_seq = output[0].strip()
                    pc_structure = structure
                    pc_start = start
                    pc_end = end
                    pc_abs_energy = energy
                    pc_norm_abs_energy = norm_abs_energy

        if pc_seq != "":
            output_precursor_infolist.append("\t".join([
                line.strip(), str(pc_abs_energy), str(pc_norm_abs_energy),
                str(pc_start), str(pc_end)]) + "\n")
            output_precursor_dblist.append(pc_seq + "\n" + pc_structure + "\n")

    # create counters (subclass of dict) to "merge" partial length
    # distribution dicts later
    length_distribution_counter = Counter(length_distribution_partial)
    return (output_precursor_infolist, output_precursor_dblist,
            reads_total_partial, length_distribution_counter)
else: smrna_file_path = os.path.join(os.getcwd(), "smrna.fa") print("Mapping smrna-seq to reference genome with bowtie...") bowtie = subprocess.Popen([bowtie_path, str(ref_file.name), "-f", smrna_file_path, os.path.join(path, "map_bowtie"), "-v", "0", "-m", str(MAX_MULTIPLE_LOCI), "-a", "-t", "-p", str(NUM_THREADS)], stdin=subprocess.PIPE, stdout=subprocess.PIPE) bowtie.wait() print("Converting bowtie map format to correct map format...") # open bowtie-generated map file (read-only, no need to be changed) output_bowtie = open(os.path.join(path, "map_bowtie"), "r") # convert bowtie map file format to correct form SeqModule.convert_bowtie_output(output_bowtie, output_map) output_map.seek(0, 0) print("Generating count data using map file...") # generate count data using map file ref_count_dump_pos, ref_count_dump_neg = SeqModule.count_generator(ref_name_list, output_map) output_map.seek(0, 0) # dump count data file for future usage and skip mapping cPickle.dump(ref_count_dump_pos, output_count_pos, -1) cPickle.dump(ref_count_dump_neg, output_count_neg, -1) output_count_pos.seek(0, 0) output_count_neg.seek(0, 0) end = time.time() print("Elapsed time for mapping : " + str(end - start) + " seconds") print("Mapping done")
def _is_dominant_site(site_counts, site, ref_length):
    """Screen one position for a Drosha / Dicer cutting site (inspired by miREAP).

    ``site_counts`` is the per-position read-count list of one reference and
    strand.  The site qualifies when it has at least 3 reads, no position
    within 19 nt on either side has more reads than the site itself, and both
    dominance ratios reach their ``DOMINANT_FACTOR`` thresholds.
    """
    count = site_counts[site]
    if count < 3:
        return False
    count_region = count  # site plus every in-range flank within 19 nt
    count_sites = count   # site plus in-range flanks within 2 nt
    for offset in range(1, 20):
        left = site - offset
        right = site + offset
        # Offsets whose flank falls outside the reference are ignored.
        if left < 0 or right >= ref_length:
            continue
        if site_counts[left] > count or site_counts[right] > count:
            return False
        count_region += site_counts[left]
        count_region += site_counts[right]
        if offset < 3:
            count_sites += site_counts[left]
            count_sites += site_counts[right]
    if float(count_sites) / count_region < DOMINANT_FACTOR:
        return False
    return float(count) / count_sites >= DOMINANT_FACTOR / 2.0


def _is_canonical_hairpin(structure):
    """Return False for asymmetric ("hard to identify") dot-bracket structures.

    The left half must contain "(" and the right half ")", and the share of
    wrong-direction brackets in each half must not exceed
    ``NON_CANONICAL_PREC_FACTOR``.
    """
    half = len(structure) // 2  # floor division: must be an int slice index
    left = structure[:half]
    right = structure[half:]
    num_open_left = left.count("(")
    num_close_right = right.count(")")
    if num_open_left == 0 or num_close_right == 0:
        return False
    if float(left.count(")")) / num_open_left > NON_CANONICAL_PREC_FACTOR:
        return False
    return float(right.count("(")) / num_close_right <= NON_CANONICAL_PREC_FACTOR


def precursor_generator(lines):
    """Find the best-folding putative precursor around each mapped read.

    Each entry of ``lines`` is split on whitespace and must have exactly 7
    fields, otherwise it is skipped.  Fields used: [1] read count,
    [2] reference name (looked up in ``ref_name_list``), [3] start, [4] end,
    [5] strand ("+" or "-"), [6] read sequence.

    For every well-formed line the read count is added to the running total
    and to the length / 5'-nucleotide distribution.  Lines whose strand is
    neither "+" nor "-" are then skipped.  Reads that pass the cutting-site
    screen are extended into candidate windows, folded with RNAfold, and the
    window with the highest length-normalised absolute MFE (and an absolute
    MFE of at least ``MIN_ABS_MFE``) is kept.

    Returns:
        tuple: ``(info_list, db_list, reads_total, length_distribution)``.
        ``info_list`` holds the stripped input line plus tab-separated
        absolute energy, normalised energy, window start and window end;
        ``db_list`` holds "sequence\\nstructure\\n" strings;
        ``length_distribution`` is a ``Counter``.
    """
    output_precursor_infolist = []
    output_precursor_dblist = []
    reads_total_partial = 0
    length_distribution_partial = count_list({})

    for line in lines:
        line_split = line.split()
        # Rare occasion of improper line data : should skip it
        if len(line_split) != 7:
            continue

        # accumulate raw rna seq read counts for calculation of RPM
        read_count = int(line_split[1])
        reads_total_partial += read_count

        # accumulate length distribution information, keyed by
        # "<read length><5' nucleotide>"
        read_seq = line_split[6]
        dict_key = str(len(read_seq)) + str(read_seq[0])
        length_distribution_partial[dict_key] += read_count

        name_list_index = ref_name_list.index(line_split[2])
        read_start = int(line_split[3])
        read_end = int(line_split[4])
        strand = line_split[5]

        # The screened position is the read start on "+" and the read end
        # on "-".
        if strand == "+":
            site_counts = ref_count_list_pos[name_list_index]
            site = read_start
        elif strand == "-":
            site_counts = ref_count_list_neg[name_list_index]
            site = read_end
        else:
            # Unknown strand: there is no sequence to fold (previously this
            # left rna_fold_seq unbound or stale from an earlier line).
            continue

        # Only the reference the read mapped to is searched.
        ref_seq = ref_seq_list[name_list_index]
        if not _is_dominant_site(site_counts, site, len(ref_seq)):
            continue

        # Precursor Candidate Information Variable List
        pc_seq = ""
        pc_structure = ""
        pc_start = 0
        pc_end = 0
        pc_abs_energy = 0
        pc_norm_abs_energy = 0

        # Find min. MFE of fold structure and save it to result_precursor
        # WARNING : if abs. of calculated free energy is less than 10,
        # output[2] does not contain proper value.  Skipping this precursor
        # line is proper, since threshold value is at least 18.
        # Arm extension is a single fixed value (the extension loop is disabled).
        arm = ARM_EXTEND_THRESHOLD
        read_span = read_end - read_start
        for distance in range(read_span, DISTANCE_THRESHOLD + read_span,
                              RNAFOLD_STEP):
            windows = (
                # Assuming -5p mature sequence
                (read_start - arm, read_end + distance + arm),
                # Assuming -3p mature sequence
                (read_start - distance - arm, read_end + arm),
            )
            for start, end in windows:
                # continue only if both indices are valid
                if not (start >= 0 and end < len(ref_seq)):
                    continue
                rna_fold_seq = ref_seq[start:end]
                if strand == "-":
                    rna_fold_seq = SeqModule.create_star(rna_fold_seq)
                # NOTE: the two rejections below use ``break`` so that a
                # rejected -5p window also skips the -3p window of the same
                # distance, exactly as the original control flow did.
                if "N" in rna_fold_seq:
                    break
                # universal_newlines=True keeps stdin/stdout as text on both
                # Python 2 and Python 3.
                rnafold = subprocess.Popen(
                    [RNAfold_path, "--noconv", "-d2", "--noPS", "--noLP"],
                    stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                    universal_newlines=True)
                output = rnafold.communicate(rna_fold_seq)[0].split()
                structure = output[1].strip("\n")

                # Discard non-canonical (i.e. "hard to identify") precursor
                if not _is_canonical_hairpin(structure):
                    break

                abs_energy = re.findall(r'\d*\.\d*', str(output[2]))
                if not abs_energy:
                    continue
                energy = float(abs_energy[0])
                if energy < MIN_ABS_MFE:
                    continue
                norm_abs_energy = energy / len(rna_fold_seq)
                # Keep the first acceptable window, then any strictly better
                # one (the old ``pc_seq == []`` test could never be true).
                if pc_seq == "" or norm_abs_energy > pc_norm_abs_energy:
                    pc_seq = output[0].strip()
                    pc_structure = structure
                    pc_start = start
                    pc_end = end
                    pc_abs_energy = energy
                    pc_norm_abs_energy = norm_abs_energy

        if pc_seq != "":
            output_precursor_infolist.append("\t".join([
                line.strip(), str(pc_abs_energy), str(pc_norm_abs_energy),
                str(pc_start), str(pc_end)]) + "\n")
            output_precursor_dblist.append(pc_seq + "\n" + pc_structure + "\n")

    # create counters (subclass of dict) to "merge" partial length
    # distribution dicts later
    length_distribution_counter = Counter(length_distribution_partial)
    return (output_precursor_infolist, output_precursor_dblist,
            reads_total_partial, length_distribution_counter)
bowtie_path, str(ref_file.name), "-f", smrna_file_path, os.path.join(path, "map_bowtie"), "-v", "0", "-m", str(MAX_MULTIPLE_LOCI), "-a", "-t", "-p", str(NUM_THREADS), '--large-index' ], stdin=subprocess.PIPE, stdout=subprocess.PIPE) bowtie.wait() print("Converting bowtie map format to correct map format...") # open bowtie-generated map file (read-only, no need to be changed) output_bowtie = open(os.path.join(path, "map_bowtie"), "r") # convert bowtie map file format to correct form SeqModule.convert_bowtie_output(output_bowtie, output_map) output_map.seek(0, 0) print("Generating count data using map file...") # generate count data using map file ref_count_dump_pos, ref_count_dump_neg = SeqModule.count_generator( ref_name_list, output_map) output_map.seek(0, 0) # dump count data file for future usage and skip mapping cPickle.dump(ref_count_dump_pos, output_count_pos, -1) cPickle.dump(ref_count_dump_neg, output_count_neg, -1) output_count_pos.seek(0, 0) output_count_neg.seek(0, 0) end = time.time() print("Elapsed time for mapping : " + str(end - start) + " seconds")