def fix_contigs(k):
    """Split every contig that some transcript covers only partially.

    A contig is left alone when every entry of ``contig_find_trans_list`` for
    it spans ``[0, n_of_kmer)``.  Otherwise the contig is cut at every distinct
    ``start_in_tran`` / ``stop_in_tran`` value: the first piece replaces the
    original contig in place, the remaining pieces are appended to
    ``contig_list`` (with a matching ``-1`` placeholder in ``ecs_list``).  The
    k-mer table ``kmer_str_dict`` and ``contig_find_trans_list`` are updated so
    that they describe the new pieces.

    Args:
        k: k-mer length; a piece holding ``n`` k-mers is ``n + k - 1``
            characters long.
    """
    # Only contigs present on entry are inspected; the range is evaluated once,
    # so pieces appended below are not revisited.
    for contig_id in range(len(contig_find_trans_list)):
        old_trans_info = contig_find_trans_list[contig_id]
        n_of_kmer_in_curr_contig = contig_list[contig_id].n_of_kmer
        fully_covered = all(
            info.start_in_tran == 0
            and info.stop_in_tran == n_of_kmer_in_curr_contig
            for info in old_trans_info
        )
        if fully_covered:
            continue

        break_points = sorted({
            point
            for info in old_trans_info
            for point in (info.start_in_tran, info.stop_in_tran)
        })
        old_contig_seq = contig_list[contig_id].seq

        for j in range(len(break_points) - 1):
            seg_start = break_points[j]
            seg_stop = break_points[j + 1]

            # A fresh object per piece: reusing one Contig would make every
            # piece alias the same instance and end up with the last segment's
            # fields.
            new_contig = Contig()
            # Slice bounds are absolute positions in the old sequence; the end
            # is the last k-mer's start (seg_stop - 1) plus k.
            new_contig.seq = old_contig_seq[seg_start:seg_stop + k - 1]
            new_contig.n_of_kmer = seg_stop - seg_start
            if j == 0:
                new_contig.id = contig_id
                contig_list[contig_id] = new_contig
            else:
                new_contig.id = len(contig_list)
                contig_list.append(new_contig)
                ecs_list.append(-1)

            # Re-point every k-mer of the piece at its new contig and offset.
            for m in range(len(new_contig.seq) - k + 1):
                curr_rep = kmer_rep(new_contig.seq[m:m + k])
                if curr_rep in kmer_str_dict:
                    kmer_entry = kmer_str_dict[curr_rep]
                    kmer_entry.contig_id = new_contig.id
                    kmer_entry.pos_in_contig = m
                    kmer_entry.n_of_kmer_in_contig = new_contig.n_of_kmer
                else:
                    program_stop("ecs.py-4")

            # Every transcript whose old interval overlaps this piece covers
            # the whole piece, because the piece lies between two adjacent
            # break points.
            new_trans_info_list = []
            for each_trans_info in old_trans_info:
                if (each_trans_info.start_in_tran < seg_stop
                        and each_trans_info.stop_in_tran > seg_start):
                    new_trans_info = ContigFindTrans()
                    new_trans_info.sense_in_tran = each_trans_info.sense_in_tran
                    new_trans_info.tran_id = each_trans_info.tran_id
                    new_trans_info.start_in_tran = 0
                    new_trans_info.stop_in_tran = new_contig.n_of_kmer
                    new_trans_info_list.append(new_trans_info)
            if j == 0:
                # old_trans_info still refers to the original list object.
                contig_find_trans_list[contig_id] = new_trans_info_list
            else:
                contig_find_trans_list.append(new_trans_info_list)
def write_em_tsv(file_name, alpha_list, eff_lens):
    """Write the quantification table as a tab-separated file.

    One header line is followed by one row per entry of ``alpha_list``.
    Row ``i`` holds ``transcript_info.tran_name[i]``,
    ``transcript_info.tran_len[i]``, ``eff_lens[i]`` rounded to 4 decimals and
    ``alpha_list[i]`` rounded to 8 decimals.

    Args:
        file_name: path of the file to create or overwrite.
        alpha_list: per-transcript values written in the ``counts`` column.
        eff_lens: per-transcript values written in the ``eff_len`` column;
            indexed in parallel with ``alpha_list``.
    """
    tran_name = transcript_info.tran_name
    tran_len = transcript_info.tran_len
    try:
        fp = open(file_name, 'w')
    except IOError:
        print("cannot open", file_name)
        # NOTE(review): program_stop presumably terminates the process;
        # if it returns, ``fp`` is unbound below, exactly as before.
        program_stop("em.py")
    # The context manager closes the file even if a write fails part-way.
    with fp:
        fp.write("tran_name\ttran_len\teff_len\tcounts" + "\n")
        for idx, alpha in enumerate(alpha_list):
            fp.write(tran_name[idx] + "\t"
                     + str(tran_len[idx]) + "\t"
                     + str(round(eff_lens[idx], 4)) + "\t"
                     + str(round(alpha, 8)) + "\n")
def read_tran_files(file_name):
    """Read a (optionally gzip-compressed) FASTA file of transcripts.

    For every record the id and the raw sequence length are appended to
    ``transcript_info.tran_name`` / ``transcript_info.tran_len``; the sequence
    itself is passed through ``replace_base`` and ``check_poly_a_tail`` before
    being collected.  ``transcript_info.tran_num`` is set to the total number
    of names recorded.

    Args:
        file_name: path of the FASTA file; a name ending in ``.gz`` is opened
            with ``gzip`` in text mode.

    Returns:
        The list of processed sequences, in file order.
    """
    try:
        # Decide on the file extension only; a ".gz" elsewhere in the path
        # (for example in a directory name) must not trigger gzip decoding.
        if file_name.endswith(".gz"):
            fp = gzip.open(file_name, 'rt')
        else:
            fp = open(file_name, 'r')
    except IOError:
        print(file_name, "is not exist!")
        # NOTE(review): program_stop presumably terminates the process.
        program_stop("read_tran.py")

    seqs = []
    # Close the handle even when parsing or post-processing raises.
    with fp:
        for seq_record in SeqIO.parse(fp, "fasta"):
            transcript_info.tran_name.append(seq_record.id)
            # Length is recorded before base replacement / poly-A handling.
            transcript_info.tran_len.append(len(seq_record.seq))
            seq = replace_base(seq_record.seq)
            seq = check_poly_a_tail(seq)
            seqs.append(seq)
    transcript_info.tran_num = len(transcript_info.tran_name)
    return seqs
def start(args):
    """Validate the quantification arguments, load the index and process reads.

    Steps, in order:
      1. check that ``args.index_path`` exists, load it with ``load_idx`` and
         verify that the stored k equals ``args.kmer_len``;
      2. copy the relevant options onto the shared ``def_args`` object and
         check that the read file(s) exist;
      3. resolve the output directory (a bare name is placed under the current
         working directory) and create it if missing;
      4. hand over to ``read_fq_fa_files``.

    Args:
        args: parsed command-line options; the attributes read here are
            ``index_path``, ``kmer_len``, ``single_mode``, ``fragment_len``,
            ``sd_len``, ``fa_or_fq_file``, ``fa_or_fq_files_1``,
            ``fa_or_fq_files_2``, ``output_path`` and ``threads``.
    """
    if not os.path.exists(args.index_path):
        print("the index_file is not exist.please check.")
        program_stop("quant.py")
    else:
        begin = datetime.datetime.now()
        load_k = load_idx(args.index_path)
        end = datetime.datetime.now()
        print("load_idx:", end - begin)
        if args.kmer_len != load_k:
            print("the k value is inconsistent.")
            program_stop("quant.py")

    def_args.k = args.kmer_len
    def_args.index = args.index_path

    if args.single_mode:
        def_args.len_frag = args.fragment_len
        def_args.sd = args.sd_len
        def_args.fq_file_single = args.fa_or_fq_file
        if not os.path.exists(def_args.fq_file_single):
            print("fq_or_fa_files is required.please check.")
            program_stop("quant.py")
    else:
        def_args.fq_file_1 = args.fa_or_fq_files_1
        def_args.fq_file_2 = args.fa_or_fq_files_2
        # Paired mode needs BOTH mate files; stop if either one is missing.
        if not (os.path.exists(def_args.fq_file_1)
                and os.path.exists(def_args.fq_file_2)):
            print("fq_or_fa_files is required.please check.")
            program_stop("quant.py")

    # A path without a directory component is placed under the current
    # working directory.
    out_parent, out_name = os.path.split(args.output_path)
    if not out_parent:
        output_path = os.path.join(os.getcwd(), out_name)
    else:
        output_path = args.output_path
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    def_args.output = output_path
    def_args.threads = args.threads
    def_args.single_mode = args.single_mode
    read_fq_fa_files(def_args)
def start(args):
    """Validate the indexing arguments and build the index step by step.

    Checks that k is odd and that the input FASTA exists, resolves the index
    output path (a bare file name is placed under the current working
    directory; otherwise the parent directory must already exist), stores the
    settings on the shared ``def_args`` object and then runs, timing each one:
    ``read_tran_files``, ``build_dbg``, ``build_ecs`` and ``write_idx``.

    Args:
        args: parsed command-line options; the attributes read here are
            ``kmer_len``, ``input_fna_path`` and ``index_path``.
    """
    now = datetime.datetime.now

    if args.kmer_len % 2 == 0:
        print("k value must be odd.")
        program_stop("index.py")

    # "null" is the sentinel used for "no input path given".
    if args.input_fna_path == "null":
        print("the input_fna_path is required.")
        program_stop("index.py")
    elif not os.path.isfile(args.input_fna_path):
        print("the input_fna_path is wrong.please check.")
        program_stop("index.py")

    out_parent, out_name = os.path.split(args.index_path)
    if len(out_parent) == 0:
        index_path = os.path.join(os.getcwd(), out_name)
    elif not os.path.exists(out_parent):
        print("the index_out_DIR is not exist.please check.")
        program_stop("index.py")
    else:
        index_path = args.index_path

    def_args.k = args.kmer_len
    def_args.index = index_path
    def_args.fna_file = args.input_fna_path

    started = now()
    seqs = read_tran_files(def_args.fna_file)
    finished = now()
    print("read_tran:", finished - started)

    started = now()
    build_dbg(seqs, def_args.k)
    finished = now()
    print("build_contig:", finished - started)

    started = now()
    build_ecs(seqs, def_args.k)
    finished = now()
    print("build_ecs:", finished - started)

    started = now()
    write_idx(def_args.index, def_args.k)
    finished = now()
    print("write_idx:", finished - started)
def _open_read_file(file_name):
    """Open a read file as text and return ``(handle, format_suffix)``.

    A file whose last extension is ``.gz`` is opened through ``gzip`` and its
    format suffix is the extension in front of ``.gz``; any other file is
    opened directly and its own extension is the format suffix.
    """
    stem, suffix = os.path.splitext(file_name)
    if suffix == ".gz":
        return gzip.open(file_name, 'rt'), os.path.splitext(stem)[1]
    return open(file_name, 'r'), suffix


def read_fq_fa_files(args):
    """Process the read file(s), count equivalence classes and run the EM step.

    The reads are parsed (FASTA via ``to_fasta``, FASTQ via
    ``convert_to_fasta``) and handed to a worker pool; each worker result is a
    pair that is matched to an equivalence class with ``find_ec``.  Matches
    increment ``counts``; unmatched results are appended to ``new_ecs``.  While
    ``f_len_goal`` is positive, fragment lengths of results that map to a
    single-transcript class are accumulated in ``f_lens``.  Finally the mean
    fragment lengths, the estimated transcript lengths and the EM result are
    computed and written to ``dualisto_quant.tsv`` inside ``args.output``.

    Args:
        args: settings object; the attributes read here are ``k``, ``threads``,
            ``single_mode``, ``fq_file_single`` or ``fq_file_1`` /
            ``fq_file_2``, and ``output``.
    """
    global k, f_len_goal
    k = args.k
    pool = Pool(processes=args.threads)
    begin = datetime.datetime.now()

    if args.single_mode:
        file_name = args.fq_file_single
        try:
            fp, read_format = _open_read_file(file_name)
        except IOError:
            print(file_name, "is not exist!")
            program_stop("process_read.py")
        # pool.map has consumed the whole input by the time it returns, so the
        # handle can be released right after; ``finally`` also covers errors.
        try:
            if read_format in (".fasta", ".fa"):
                res = pool.map(do_something_with_record, to_fasta(fp))
                pool.close()
                pool.join()
            elif read_format in (".fastq", ".fq"):
                res = pool.map(do_something_with_record, convert_to_fasta(fp))
                pool.close()
                pool.join()
            else:
                print("cannot find .fa(.fasta) or .fq(.fastq).please check.")
                program_stop("process_read.py")
        finally:
            fp.close()
        print(len(res))
        print("please waite single mode.")
    else:  # paired mode
        file_name_1 = args.fq_file_1
        file_name_2 = args.fq_file_2
        try:
            fp1, read_format_1 = _open_read_file(file_name_1)
            fp2, read_format_2 = _open_read_file(file_name_2)
        except IOError:
            print(file_name_1, "or", file_name_2, "is not exist!")
            program_stop("process_read.py")
        try:
            if read_format_1 != read_format_2:
                print("suffix is inconsistent.please check.")
                program_stop("process_read.py")
            elif read_format_1 in (".fasta", ".fa"):
                res = pool.starmap(do_something_with_record_paired,
                                   zip(to_fasta(fp1), to_fasta(fp2)))
                pool.close()
                pool.join()
            elif read_format_1 in (".fastq", ".fq"):
                res = pool.starmap(do_something_with_record_paired,
                                   zip(convert_to_fasta(fp1),
                                       convert_to_fasta(fp2)))
                pool.close()
                pool.join()
            else:
                print("cannot find .fa(.fasta) or .fq(.fastq).please check.")
                program_stop("process_read.py")
        finally:
            fp1.close()
            fp2.close()

    end = datetime.datetime.now()
    print("process_reads:", end - begin)

    begin = datetime.datetime.now()
    for each_u, each_tl in res:
        ec = find_ec(each_u)
        if ec == -1 or ec >= len(counts):
            # No known equivalence class for this result.
            new_ecs.append(each_u)
        else:
            counts[ec] = counts[ec] + 1
            # Classes with an id below tran_num are used for the
            # fragment-length histogram, until the goal is used up.
            if f_len_goal > 0 and 0 <= ec < transcript_info.tran_num:
                if 0 < each_tl < len(f_lens):
                    f_lens[each_tl] = f_lens[each_tl] + 1
                    f_len_goal = f_len_goal - 1
    res.clear()
    print("counts:", counts)
    end = datetime.datetime.now()
    print("match_ecs:", end - begin)

    begin = datetime.datetime.now()
    mean_f_lens = compute_mean_flg_lens(f_lens)
    f_lens.clear()
    tran_lens_estimated = get_each_tran_len(mean_f_lens)
    mean_f_lens.clear()
    alpha_list, eff_lens = em_run(counts, tran_lens_estimated)
    output_file = os.path.join(args.output, "dualisto_quant.tsv")
    write_em_tsv(output_file, alpha_list, eff_lens)
    end = datetime.datetime.now()
    print("em_run:", end - begin)
def build_ecs(seqs, k):
    """Build the equivalence classes and the contig/transcript cross-references.

    Works on the module-level graph state in four passes:

    1. Walk every transcript k-mer by k-mer, jumping contig by contig, and
       record for each visited contig which part ``[start, stop)`` of it the
       transcript covers (``ContigFindTrans``).  Each transcript also gets its
       own single-member equivalence class in ``ec_map`` / ``ec_inv_dict``.
    2. Call ``fix_contigs`` so that partially covered contigs are split.
    3. Give every contig the equivalence class formed by the sorted set of
       transcripts that touch it, creating new classes on demand, and store
       the class id in ``ecs_list`` and on the contig (``ecs_id``).
    4. Walk the transcripts again and append a ``ContigIncludeTrans`` record
       (transcript id, position, orientation) to each contig they traverse.

    Args:
        seqs: transcript sequences, indexed by transcript id.
        k: k-mer length.
    """
    contig_find_trans_dict = {
        contig_id: [] for contig_id in range(len(contig_list))
    }

    # Pass 1: which stretch of which contig does each transcript cover?
    for tran_id, seq in enumerate(seqs):
        each_ec = [tran_id]
        ec_map.append(each_ec)
        ec_inv_dict[tran_id] = each_ec

        n_of_kmers_of_seq = len(seq) - k + 1
        j = 0
        while j < n_of_kmers_of_seq:
            curr_kmer = str(seq[j:j + k])
            curr_rep = kmer_rep(curr_kmer)
            if curr_rep in kmer_str_dict:
                curr_kmer_info = kmer_str_dict[curr_rep]
                curr_contig_id = curr_kmer_info.contig_id
                contig_seq = contig_list[curr_contig_id].seq
                # Number of k-mers in the contig.  The offset of a k-mer in
                # the contig is the offset of its first occurrence in the
                # contig sequence, so no per-step k-mer list is needed.
                kmers_in_contig_len = max(len(contig_seq) - k + 1, 0)
                kmers_left_in_seq = n_of_kmers_of_seq - j

                contig_find_trans = ContigFindTrans()
                contig_find_trans.tran_id = tran_id
                if (curr_kmer == curr_rep) == curr_kmer_info.sense_in_contig:
                    # Transcript runs along the contig in the same direction.
                    contig_find_trans.sense_in_tran = 1
                    contig_find_trans.start_in_tran = contig_seq.index(
                        curr_kmer)
                    if (kmers_in_contig_len - contig_find_trans.start_in_tran
                            > kmers_left_in_seq):
                        # The transcript ends inside the contig; this partial
                        # interval is what fix_contigs later splits on.
                        contig_find_trans.stop_in_tran = (
                            contig_find_trans.start_in_tran
                            + kmers_left_in_seq)
                        j = n_of_kmers_of_seq
                    else:
                        # The transcript runs to the end of the contig;
                        # continue with the k-mer right after it.
                        contig_find_trans.stop_in_tran = kmers_in_contig_len
                        j = (j + contig_find_trans.stop_in_tran
                             - contig_find_trans.start_in_tran)
                else:
                    # Transcript runs against the contig: the current k-mer's
                    # twin marks the (exclusive) stop position.
                    contig_find_trans.sense_in_tran = 0
                    contig_find_trans.stop_in_tran = contig_seq.index(
                        kmer_twin(curr_kmer)) + 1
                    if contig_find_trans.stop_in_tran > kmers_left_in_seq:
                        contig_find_trans.start_in_tran = (
                            contig_find_trans.stop_in_tran
                            - kmers_left_in_seq)
                        j = n_of_kmers_of_seq
                    else:
                        contig_find_trans.start_in_tran = 0
                        j = (j + contig_find_trans.stop_in_tran
                             - contig_find_trans.start_in_tran)
                contig_find_trans_dict[curr_contig_id].append(
                    contig_find_trans)
            else:
                print(curr_rep)
                print(j)
                program_stop("ecs.py-2")

    contig_find_trans_list.extend(
        contig_find_trans_dict[contig_id]
        for contig_id in range(len(contig_list))
    )

    # Pass 2: split contigs that some transcript covers only partially.
    fix_contigs(k)

    # Pass 3: assign an equivalence class to every contig.  The reverse index
    # replaces a scan over all existing classes for each contig.
    ec_of_members = {
        tuple(members): ec_id for ec_id, members in ec_inv_dict.items()
    }
    for curr_contig_id, trans_infos in enumerate(contig_find_trans_list):
        members = sorted({info.tran_id for info in trans_infos})
        members_key = tuple(members)
        ec = ec_of_members.get(members_key)
        if ec is None:
            ec = len(ec_inv_dict)
            ec_inv_dict[ec] = members
            ec_map.append(members)
            ec_of_members[members_key] = ec
        ecs_list[curr_contig_id] = ec
        contig_list[curr_contig_id].ecs_id = ec

    # Pass 4: record on every contig which transcripts traverse it, and where.
    for tran_id, seq in enumerate(seqs):
        n_of_kmers_of_seq = len(seq) - k + 1
        j = 0
        while j < n_of_kmers_of_seq:
            curr_kmer = str(seq[j:j + k])
            curr_rep = kmer_rep(curr_kmer)
            if curr_rep in kmer_str_dict:
                curr_kmer_info = kmer_str_dict[curr_rep]
                curr_contig = contig_list[curr_kmer_info.contig_id]
                contig_include_trans = ContigIncludeTrans()
                contig_include_trans.tran_id = tran_id
                contig_include_trans.pos_in_tran = j
                contig_include_trans.sense_in_tran = int(
                    (curr_kmer == curr_rep) == curr_kmer_info.sense_in_contig)
                curr_contig.include_trans.append(contig_include_trans)
                # After pass 2 a transcript always covers whole contigs, so
                # jump over the entire contig.
                j = j + curr_contig.n_of_kmer
            else:
                program_stop("ecs.py-3")
def load_idx(filename):
    """Load an index file into the module-level graph state and return its k.

    The file is line oriented and is read in this order: version, k, number of
    transcripts followed by their lengths, the k-mer table (one key line and
    one ``contig_id,pos_in_contig,n_of_kmer_in_contig,sense_in_contig`` line
    per k-mer), the equivalence classes (member count, then one transcript id
    per line), the transcript names, the contigs (``id,n_of_kmer,seq`` plus
    their ``tran_id,pos_in_tran,sense_in_tran`` records) and finally one
    equivalence-class id per contig.

    The function fills ``transcript_info``, ``kmer_str_dict``, ``ec_map``,
    ``ec_inv_dict``, ``counts`` (one zero per class), ``contig_list`` and
    ``ecs_list``.  These containers are assumed to be empty on entry.

    NOTE(review): the version read here is compared with
    ``kallisto_index_version`` while ``write_idx`` writes ``version`` --
    confirm that both names hold the same string.

    Args:
        filename: path of the index file.

    Returns:
        The k-mer length stored in the index.
    """
    try:
        fp = open(filename, 'r')
    except IOError:
        print(filename, "is not exist!")
        program_stop("index_write_load.py")

    def _read_int():
        # Every numeric field sits on a line of its own.
        return int(fp.readline().strip())

    # Close the handle even if the file is truncated or malformed.
    with fp:
        # 1. version
        read_version = fp.readline().strip()
        if read_version != kallisto_index_version:
            program_stop("index_write_load.py")

        # 2. k
        k = _read_int()

        # 3. number of transcripts and their lengths
        transcript_info.tran_num = _read_int()
        transcript_info.tran_len = [
            _read_int() for _ in range(transcript_info.tran_num)
        ]

        # 4. k-mer table
        kmer_count = _read_int()
        print("kmer_num:", kmer_count)
        for _ in range(kmer_count):
            kmer_str = fp.readline().strip()
            curr_kmer_info = fp.readline().strip().split(",")
            kmer_info = KmerInfo()
            kmer_info.contig_id = int(curr_kmer_info[0])
            kmer_info.pos_in_contig = int(curr_kmer_info[1])
            kmer_info.n_of_kmer_in_contig = int(curr_kmer_info[2])
            kmer_info.sense_in_contig = int(curr_kmer_info[3])
            kmer_str_dict[kmer_str] = kmer_info
        # The module-level scratch lists are left empty, as before.
        kmer_str_list.clear()
        kmer_info_list.clear()

        # 5./6. equivalence classes
        ec_map_len = _read_int()
        print("ecs_num:", ec_map_len)
        for ec_id in range(ec_map_len):
            each_ecs_len = _read_int()
            members = [_read_int() for _ in range(each_ecs_len)]
            # ec_map and ec_inv_dict share the same list object.
            ec_map.append(members)
            ec_inv_dict[ec_id] = members
            counts.append(0)

        # 7. transcript names
        transcript_info.tran_name = [
            fp.readline().strip() for _ in range(transcript_info.tran_num)
        ]

        # 8. contigs
        contig_list_len = _read_int()
        print("contig_num:", contig_list_len)
        for _ in range(contig_list_len):
            curr_contig_info = fp.readline().strip().split(",")
            load_contig = Contig()
            load_contig.id = int(curr_contig_info[0])
            load_contig.n_of_kmer = int(curr_contig_info[1])
            load_contig.seq = curr_contig_info[2]

            # 8.1 transcripts that traverse this contig
            include_trans_len = _read_int()
            include_trans = []
            for _ in range(include_trans_len):
                curr_include_trans = fp.readline().strip().split(",")
                load_include_trans = ContigIncludeTrans()
                load_include_trans.tran_id = int(curr_include_trans[0])
                load_include_trans.pos_in_tran = int(curr_include_trans[1])
                load_include_trans.sense_in_tran = int(curr_include_trans[2])
                # Keep the parsed record instead of discarding it.
                include_trans.append(load_include_trans)
            load_contig.include_trans = include_trans
            # Store the contig itself rather than leaving a 0 placeholder.
            contig_list.append(load_contig)

        # 9. equivalence-class id of every contig
        for _ in range(contig_list_len):
            ecs_list.append(_read_int())

    return k
def write_idx(filename, k):
    """Write the module-level graph state to a line-oriented index file.

    Sections are written in this order: version, k, number of transcripts and
    their lengths, the k-mer table (key line followed by
    ``contig_id,pos_in_contig,n_of_kmer_in_contig,sense_in_contig``), the
    equivalence classes (member count, then one transcript id per line), the
    transcript names, the contigs (``id,n_of_kmer,seq`` followed by their
    ``tran_id,pos_in_tran,sense_in_tran`` records) and one equivalence-class
    id per contig.

    As before, the module-level lists ``kmer_str_list`` and ``kmer_info_list``
    hold the keys and values of ``kmer_str_dict`` when the function returns.

    Args:
        filename: path of the index file to create or overwrite.
        k: k-mer length to store in the index.
    """
    try:
        fp = open(filename, 'w')
    except IOError:
        print(filename, "is not exist!")
        program_stop("index_write_load.py")

    # Close the handle (flushing it) even if a write fails part-way.
    with fp:
        # 1. version
        fp.write(version + "\n")
        # 2. k
        fp.write(str(k) + "\n")

        # 3. number of transcripts and their lengths
        fp.write(str(transcript_info.tran_num) + "\n")
        for i in range(transcript_info.tran_num):
            fp.write(str(transcript_info.tran_len[i]) + "\n")

        # 4. k-mer table.  Reset the scratch lists first so that entries left
        # over from an earlier use are not written as extra records.
        kmer_str_list.clear()
        kmer_info_list.clear()
        for key, info in kmer_str_dict.items():
            kmer_str_list.append(key)
            kmer_info_list.append(info)
        fp.write(str(len(kmer_str_list)) + "\n")
        for kmer_str, kmer_info in zip(kmer_str_list, kmer_info_list):
            fp.write(kmer_str + "\n")
            fp.write(str(kmer_info.contig_id) + ","
                     + str(kmer_info.pos_in_contig) + ","
                     + str(kmer_info.n_of_kmer_in_contig) + ","
                     + str(kmer_info.sense_in_contig) + "\n")

        # 5./6. equivalence classes: member count, then one id per line
        fp.write(str(len(ec_map)) + "\n")
        for members in ec_map:
            fp.write(str(len(members)) + "\n")
            for tran_id in members:
                fp.write(str(tran_id) + "\n")

        # 7. transcript names
        for i in range(transcript_info.tran_num):
            fp.write(transcript_info.tran_name[i] + "\n")

        # 8. contigs
        fp.write(str(len(contig_list)) + "\n")
        for contig in contig_list:
            fp.write(str(contig.id) + ","
                     + str(contig.n_of_kmer) + ","
                     + str(contig.seq) + "\n")
            # 8.1 transcripts that traverse this contig
            fp.write(str(len(contig.include_trans)) + "\n")
            for curr_include_trans in contig.include_trans:
                fp.write(str(curr_include_trans.tran_id) + ","
                         + str(curr_include_trans.pos_in_tran) + ","
                         + str(curr_include_trans.sense_in_tran) + "\n")

        # 9. equivalence-class id of every contig
        for ec_id in ecs_list:
            fp.write(str(ec_id) + "\n")
def build_dbg(seqs, k):
    """Cut the sequences into k-mers and chain them into contigs.

    Every distinct k-mer representative gets a ``KmerInfo`` entry in
    ``kmer_str_dict``.  Each k-mer that is not yet assigned to a contig
    (``contig_id == -1``) is then used as a seed: the chain is extended
    forward from the seed and, unless a loop was detected, forward from the
    seed's twin (which is the backward direction of the seed).  The resulting
    k-mer chain becomes a new ``Contig`` appended to ``contig_list``, with a
    ``-1`` placeholder appended to ``ecs_list``.

    NOTE(review): ``fwstep`` is defined elsewhere; the loops below rely on it
    advancing the module-level ``fw_step_kmer`` and returning a truthy value
    while the chain can be extended -- confirm against its definition.

    Args:
        seqs: sequences to index.
        k: k-mer length.
    """
    global fw_step_kmer

    # Break the sequences up and collect the set of k-mer representatives.
    unique_reps = set()
    for seq in seqs:
        for offset in range(len(seq) - k + 1):
            unique_reps.add(kmer_rep(str(seq[offset:offset + k])))

    # Attach an info record to every k-mer.
    for rep in unique_reps:
        kmer_str_dict[rep] = KmerInfo()

    # Chain the k-mers together to find the contigs.
    for seed, seed_info in kmer_str_dict.items():
        if seed_info.contig_id != -1:
            continue  # already part of a contig

        seed_twin = kmer_twin(seed)
        self_loop = 0

        # Extend forward from the seed.
        fw_list = [seed]
        prev_kmer = seed
        fw_step_kmer = seed
        while fwstep(fw_step_kmer):
            if fw_step_kmer == seed:
                # Back at the seed: the chain is a cycle.
                # e.g. (3,"ACTGAC") (5,"ACCAACCA") (5,"TCTGTCTG")
                #      (5,"AACAAACA") (5,"CACACA") (5,"ACACAC")
                self_loop = 1
                break
            if fw_step_kmer == seed_twin:
                # Reached the seed's twin; a loop only if a step was taken.
                self_loop = (len(fw_list) > 1)
                break
            if fw_step_kmer == kmer_twin(prev_kmer):
                # Stepped onto the twin of the previous k-mer.
                break
            fw_list.append(fw_step_kmer)
            prev_kmer = fw_step_kmer

        # Extend forward from the twin, i.e. backward from the seed.
        fw_step_kmer = seed_twin
        bw_list = []
        head_kmer = seed_twin
        if not self_loop:
            while fwstep(fw_step_kmer):
                if (fw_step_kmer == seed_twin
                        or fw_step_kmer == seed
                        or fw_step_kmer == kmer_twin(head_kmer)):
                    break
                bw_list.append(fw_step_kmer)
                head_kmer = fw_step_kmer

        # Backward part (reversed and flipped to the seed's strand) followed
        # by the forward part.
        chain = [kmer_twin(bw) for bw in reversed(bw_list)]
        chain.extend(fw_list)

        n_in_chain = len(chain)
        new_contig_id = len(contig_list)
        contig = Contig()
        contig.id = new_contig_id
        contig.n_of_kmer = n_in_chain

        # Store the contig membership on every k-mer of the chain.
        for pos, chain_kmer in enumerate(chain):
            chain_rep = kmer_rep(chain_kmer)
            if chain_rep in kmer_str_dict:
                entry = kmer_str_dict[chain_rep]
                entry.contig_id = new_contig_id
                entry.pos_in_contig = pos
                entry.n_of_kmer_in_contig = n_in_chain
                # 1 when the contig holds the representative itself,
                # 0 when it holds the twin.
                entry.sense_in_contig = 1 if chain_rep == chain_kmer else 0
            else:
                program_stop("contig.py")
            # Consecutive k-mers overlap by k - 1 characters, so each one
            # after the first contributes only its last character.
            if pos == 0:
                contig.seq = chain_kmer
            else:
                contig.seq = contig.seq + chain_kmer[-1]

        contig_list.append(contig)
        ecs_list.append(-1)