def fastq_single_core(args):
    k = args.k
    q_threshold = args.quality_threshold
    error_rates = []
    read_array = []
    for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(args.fastq, 'r'))):
        if i % 10000 == 0:
            print(i, "reads processed.")

        # skip very short reads or degenerate reads (too short after homopolymer compression)
        seq_hpol_comp = ''.join(ch for ch, _ in itertools.groupby(seq))
        if len(seq) < 2 * k or len(seq_hpol_comp) < k:
            continue

        exp_errors_in_kmers = expected_number_of_erroneous_kmers(qual, k)
        p_no_error_in_kmers = 1.0 - exp_errors_in_kmers / float(len(seq) - k + 1)
        score = p_no_error_in_kmers * (len(seq) - k + 1)

        # Inferred average error rate, based on quality values. These values are
        # used in evaluations in the paper only; they are not used in clustering.
        poisson_mean = sum([qual.count(char_) * D_no_min[char_] for char_ in set(qual)])
        error_rate = poisson_mean / float(len(qual))
        if 10 * -math.log(error_rate, 10) <= q_threshold:  # filter reads below the Phred-scale quality threshold
            continue
        error_rates.append(error_rate)

        read_array.append((acc, seq, qual, score))

    read_array.sort(key=lambda x: x[3], reverse=True)
    return read_array, error_rates
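# Sketch of the expected_number_of_erroneous_kmers helper used above (it is
# defined elsewhere in the package; this minimal version is an assumption for
# illustration). It assumes Phred+33-encoded quality strings and uses
# linearity of expectation: a k-mer is erroneous if any of its k bases is
# erroneous, so E[#erroneous k-mers] = sum over windows of (1 - prod(1 - p_i)).
def expected_number_of_erroneous_kmers_sketch(qual, k):
    probs = [10 ** (-(ord(ch) - 33) / 10.0) for ch in qual]  # per-base error probability
    exp_errors = 0.0
    for i in range(len(probs) - k + 1):
        p_no_error_in_window = 1.0
        for p in probs[i: i + k]:
            p_no_error_in_window *= 1.0 - p
        exp_errors += 1.0 - p_no_error_in_window
    return exp_errors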
def print_read_categories(reads_unindexed, reads_indexed, reads, outfolder, SAM_file):
    reads_to_align = open(os.path.join(outfolder, "reads_after_genomic_filtering.fasta"), "w")
    unindexed_aligned = pysam.AlignmentFile(os.path.join(outfolder, "unindexed.sam"), "w", template=SAM_file)
    indexed_aligned = pysam.AlignmentFile(os.path.join(outfolder, "indexed.sam"), "w", template=SAM_file)
    for acc, (seq, _) in help_functions.readfq(open(reads, "r")):
        if acc in reads_unindexed:
            read = reads_unindexed[acc]
            read.set_tag('XA', "")
            read.set_tag('XC', "uLTRA_unindexed")
            unindexed_aligned.write(read)
        else:
            reads_to_align.write(">{0}\n{1}\n".format(acc, seq))
            if acc in reads_indexed:
                read = reads_indexed[acc]
                indexed_aligned.write(read)
    unindexed_aligned.close()
    indexed_aligned.close()
    reads_to_align.close()
    return reads_to_align.name
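# Hypothetical illustration of how the reads_unindexed/reads_indexed arguments
# above could be produced: dicts mapping read accession to a pysam.AlignedSegment.
# The SAM path and the membership test are assumptions for the example only.
def split_alignments_sketch(sam_path, unindexed_accessions):
    sam = pysam.AlignmentFile(sam_path, "r")
    reads_unindexed, reads_indexed = {}, {}
    for read in sam.fetch(until_eof=True):  # until_eof lets us iterate a plain (unindexed) SAM
        if read.query_name in unindexed_accessions:
            reads_unindexed[read.query_name] = read
        else:
            reads_indexed[read.query_name] = read
    return reads_unindexed, reads_indexed, sam  # sam can serve as the template argument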
def read_barcodes(primer_file):
    barcodes = {acc + '_fw': seq.strip() for acc, (seq, _) in help_functions.readfq(open(primer_file, 'r'))}

    # add the reverse complement of each barcode under an '_rc' suffix
    for acc, seq in list(barcodes.items()):
        print(acc, seq, acc[:-3])
        barcodes[acc[:-3] + '_rc'] = reverse_complement(seq.upper())

    print(barcodes)
    return barcodes
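# reverse_complement is used above but defined elsewhere. A minimal sketch,
# assuming uppercase DNA and mapping any non-ACGT character to N:
def reverse_complement_sketch(seq):
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(complement.get(ch, 'N') for ch in reversed(seq))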
def polish_sequences(centers, args):
    print("Saving spoa references to files:", os.path.join(args.outfolder, "consensus_reference_X.fasta"))
    # remove stale output from any previous polishing run
    if args.medaka:
        polishing_pattern = os.path.join(args.outfolder, "medaka_cl_id_*")
    elif args.racon:
        polishing_pattern = os.path.join(args.outfolder, "racon_cl_id_*")
    else:
        polishing_pattern = None

    if polishing_pattern:
        for folder in glob.glob(polishing_pattern):
            shutil.rmtree(folder)

    spoa_pattern = os.path.join(args.outfolder, "consensus_reference_*")
    for file in glob.glob(spoa_pattern):
        os.remove(file)

    for i, (nr_reads_in_cluster, c_id, center, all_reads) in enumerate(centers):
        spoa_center_file = os.path.join(args.outfolder, "consensus_reference_{0}.fasta".format(c_id))
        f = open(spoa_center_file, "w")
        f.write(">{0}\n{1}\n".format("consensus_cl_id_{0}_total_supporting_reads_{1}".format(c_id, nr_reads_in_cluster), center))
        f.close()

        all_reads_file = os.path.join(args.outfolder, "reads_to_consensus_{0}.fastq".format(c_id))
        f = open(all_reads_file, "w")
        for fasta_file in all_reads:
            reads = {acc: (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(fasta_file, 'r'))}
            for acc, (seq, qual) in reads.items():
                f.write("@{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))
        f.close()

        if args.medaka:
            print("running medaka on spoa reference {0}.".format(c_id))
            polishing_outfolder = os.path.join(args.outfolder, "medaka_cl_id_{0}".format(c_id))
            help_functions.mkdir_p(polishing_outfolder)
            run_medaka(all_reads_file, spoa_center_file, polishing_outfolder, "1", args.medaka_model)
            print("Saving medaka reference to file:", os.path.join(polishing_outfolder, "consensus.fasta"))
            # note: assumes the polished consensus FASTA has the sequence on a single line
            l = open(os.path.join(polishing_outfolder, "consensus.fasta"), 'r').readlines()
            center_polished = l[1].strip()
            centers[i][2] = center_polished
        elif args.racon:
            print("running racon on spoa reference {0}.".format(c_id))
            polishing_outfolder = os.path.join(args.outfolder, "racon_cl_id_{0}".format(c_id))
            help_functions.mkdir_p(polishing_outfolder)
            run_racon(all_reads_file, spoa_center_file, polishing_outfolder, "1", args.racon_iter)
            print("Saving racon reference to file:", os.path.join(polishing_outfolder, "consensus.fasta"))
            # note: assumes the polished consensus FASTA has the sequence on a single line
            l = open(os.path.join(polishing_outfolder, "consensus.fasta"), 'r').readlines()
            center_polished = l[1].strip()
            centers[i][2] = center_polished

    return centers
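# run_medaka and run_racon are imported from elsewhere in the package. As an
# illustration, a run_medaka wrapper could shell out to medaka's documented
# medaka_consensus entry point roughly as below; the log file names are
# assumptions, and the real wrapper may differ.
import subprocess

def run_medaka_sketch(reads_file, draft_file, outfolder, threads, model):
    with open(os.path.join(outfolder, "medaka_stdout.txt"), "w") as out_f, \
         open(os.path.join(outfolder, "medaka_stderr.txt"), "w") as err_f:
        subprocess.check_call(["medaka_consensus", "-i", reads_file, "-d", draft_file,
                               "-o", outfolder, "-t", str(threads), "-m", model],
                              stdout=out_f, stderr=err_f)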
def fastq_parallel(args):
    k = args.k
    q_threshold = args.quality_threshold
    reads = [(acc, seq, qual) for acc, (seq, qual) in help_functions.readfq(open(args.fastq, 'r'))]
    read_chunk_size = int(len(reads) / args.nr_cores) + 1
    read_batches = [b for b in batch(reads, read_chunk_size)]
    del reads

    ####### parallelize scoring #########
    # Ignore SIGINT while the worker pool is created, then restore the handler
    # in the parent so Ctrl-C is caught by the parent process only.
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    mp.set_start_method('spawn')
    print("Environment set:", mp.get_context())
    print("Using {0} cores.".format(args.nr_cores))
    start_multi = time()
    pool = Pool(processes=int(args.nr_cores))
    signal.signal(signal.SIGINT, original_sigint_handler)
    try:
        print([len(b) for b in read_batches])
        data = [{i: (b, k, q_threshold)} for i, b in enumerate(read_batches)]
        res = pool.map_async(calc_score_new, data)
        score_results = res.get(999999999)  # Without the timeout this blocking call ignores all signals.
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating workers")
        pool.terminate()
        sys.exit()
    else:
        pool.close()
    pool.join()
    print("Time elapsed multiprocessing:", time() - start_multi)

    read_array, error_rates = [], []
    for output_dict in score_results:
        for batch_index, (r_a, err_rates) in output_dict.items():
            print("Batch index", batch_index)
            read_array.extend(r_a)
            error_rates.extend(err_rates)

    read_array.sort(key=lambda x: x[3], reverse=True)
    error_rates.sort()
    return read_array, error_rates
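# batch() and calc_score_new() are defined elsewhere: batch() chunks the read
# list, and calc_score_new() unpacks one {batch_index: (reads, k, q_threshold)}
# dict and returns {batch_index: (read_array, error_rates)} so results can be
# matched back to their batch. A plausible sketch of the list chunker (an
# assumption, not the package's definition):
def batch_sketch(items, size):
    # yield consecutive slices of at most `size` elements
    for ndx in range(0, len(items), size):
        yield items[ndx: ndx + size]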
def form_draft_consensus(clusters, representatives, sorted_reads_fastq_file, work_dir, abundance_cutoff, args):
    centers = []
    reads = {acc: (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(sorted_reads_fastq_file, 'r'))}
    # process clusters from largest to smallest (ties broken by representative score)
    for c_id, all_read_acc in sorted(clusters.items(), key=lambda x: (len(x[1]), representatives[x[0]][5]), reverse=True):
        reads_path = open(os.path.join(work_dir, "reads_c_id_{0}.fq".format(c_id)), "w")
        nr_reads_in_cluster = len(all_read_acc)
        if nr_reads_in_cluster >= abundance_cutoff:
            for acc in all_read_acc:
                seq, qual = reads[acc]
                reads_path.write("@{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))
            reads_path.close()
            print("creating center of {0} sequences.".format(nr_reads_in_cluster))
            center = run_spoa(reads_path.name, os.path.join(work_dir, "spoa_tmp.fa"), "spoa")
            centers.append([nr_reads_in_cluster, c_id, center, reads_path.name])
    return centers
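# run_spoa is imported from the consensus module. Illustratively, it runs the
# spoa partial order aligner on the cluster's reads and returns the consensus
# sequence; the exact flags below ("-r 0" selects consensus output) and the
# single-line FASTA assumption are based on spoa's CLI and may differ from the
# package's wrapper.
def run_spoa_sketch(reads_path, tmp_out_path, spoa_binary):
    with open(tmp_out_path, "w") as out_f:
        subprocess.check_call([spoa_binary, reads_path, "-r", "0"], stdout=out_f)
    with open(tmp_out_path, "r") as f:
        f.readline()                 # skip the FASTA header
        return f.readline().strip()  # consensus sequence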
def main(args):
    all_reads = {i: (acc, seq, qual) for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(args.fastq, 'r')))}
    eprint("Total cluster of {0} reads.".format(len(all_reads)))
    if len(all_reads) <= args.exact_instance_limit:
        args.exact = True
    if args.set_w_dynamically:
        args.w = args.k + min(7, int(len(all_reads) / 500))

    eprint("ARGUMENT SETTINGS:")
    for key, value in args.__dict__.items():
        eprint("{0}: {1}".format(key, value))
    eprint()

    work_dir = tempfile.mkdtemp()
    print("Temporary work directory:", work_dir)

    anchors_to_read_acc = {}
    k_size = args.k
    for batch_id, reads in enumerate(batch(all_reads, args.max_seqs)):
        print("correcting {0} reads in a batch".format(len(reads)))
        w = args.w
        x_high = args.xmax
        x_low = args.xmin
        hash_fcn = "lex"
        minimizer_database = get_minimizers_and_positions(reads, w, k_size, hash_fcn)
        minimizer_combinations_database = get_minimizer_combinations_database(reads, minimizer_database, k_size, x_low, x_high)
        if args.verbose:
            eprint("done creating minimizer combinations")

        previously_corrected_regions = defaultdict(list)
        tmp_cnt = 0

        for r_id in sorted(reads):
            read_min_comb = [((m1, p1), m1_curr_spans) for (m1, p1), m1_curr_spans in minimizers_comb_iterator(minimizer_database[r_id], k_size, x_low, x_high)]
            if args.exact:
                previously_corrected_regions = defaultdict(list)
            (acc, seq, qual) = reads[r_id]

            read_previously_considered_positions = set([tmp_pos for tmp_p1, tmp_p2, w_tmp, _ in previously_corrected_regions[r_id] for tmp_pos in range(tmp_p1, tmp_p2)])
            if args.verbose:
                if read_previously_considered_positions:
                    eprint("not corrected:", [(p1_, p2_) for p1_, p2_ in zip(sorted(read_previously_considered_positions)[:-1], sorted(read_previously_considered_positions)[1:]) if p2_ > p1_ + 1])
                else:
                    eprint("not corrected: entire read")

            if previously_corrected_regions[r_id]:
                # group consecutive previously corrected positions so spans can be
                # checked against whole corrected regions, not just their endpoints
                group_id = 0
                pos_group = {}
                sorted_corr_pos = sorted(read_previously_considered_positions)
                for p1, p2 in zip(sorted_corr_pos[:-1], sorted_corr_pos[1:]):
                    if p2 > p1 + 1:
                        pos_group[p1] = group_id
                        group_id += 1
                        pos_group[p2] = group_id
                    else:
                        pos_group[p1] = group_id
                if len(sorted_corr_pos) > 1 and p2 == p1 + 1:
                    pos_group[p2] = group_id
            else:
                read_previously_considered_positions = set()
                pos_group = {}

            already_computed = {}
            read_complexity_cnt = 0
            all_intervals = []
            for (m1, p1), m1_curr_spans in read_min_comb:
                # if any position in the span is outside already corrected regions,
                # correct the span, not just its start and stop
                not_prev_corrected_spans = [(m2, p2) for (m2, p2) in m1_curr_spans if not (p1 + k_size in read_previously_considered_positions and p2 - 1 in read_previously_considered_positions)]
                set_not_prev = set(not_prev_corrected_spans)
                not_prev_corrected_spans2 = [(m2, p2) for (m2, p2) in m1_curr_spans if (m2, p2) not in set_not_prev and (p1 + k_size in pos_group and p2 - 1 in pos_group and pos_group[p1 + k_size] != pos_group[p2 - 1])]
                not_prev_corrected_spans += not_prev_corrected_spans2

                if not_prev_corrected_spans:
                    tmp_cnt, read_complexity_cnt = find_most_supported_span(r_id, m1, p1, not_prev_corrected_spans, minimizer_combinations_database, reads, all_intervals, k_size, tmp_cnt, read_complexity_cnt, already_computed)

            if args.verbose:
                print("{0} edlib invoked due to repeated anchors for this read.".format(read_complexity_cnt))
                print(tmp_cnt, "total computed editdist.")
                eprint("Correcting read", r_id)

            if previously_corrected_regions[r_id]:
                # add previously corrected regions to the intervals given to the solver
                all_intervals.extend(previously_corrected_regions[r_id])
                del previously_corrected_regions[r_id]

            if not all_intervals:
                continue  # nothing to correct for this read

            all_intervals_sorted_by_finish = sorted(all_intervals, key=lambda x: x[1])
            # solve weighted interval scheduling to find the best set of
            # non-overlapping intervals to correct over
            opt_indicies = solve_WIS2(all_intervals_sorted_by_finish)
            sol = []
            prev_stop = 0
            for j in opt_indicies:
                start, stop, weights, instance = all_intervals_sorted_by_finish[j]
                sol.append((instance["curr_read"][0][:k_size], instance["curr_read"][0][-k_size:]))
                if start - k_size > prev_stop and prev_stop > 0:
                    if args.verbose:
                        eprint("Gap in correction:", start - k_size - prev_stop, "between positions:", prev_stop, start)
                prev_stop = stop + k_size

            hashable_sol = tuple(sol)
            if hashable_sol not in anchors_to_read_acc:
                anchors_to_read_acc[hashable_sol] = set()
            anchors_to_read_acc[hashable_sol].add(acc)
            print("processed:", r_id, len(anchors_to_read_acc), len(hashable_sol), hashable_sol)

    eprint("Number of unique transcripts (based on anchor solution):", len(anchors_to_read_acc))
    for anchors, read_set in anchors_to_read_acc.items():
        print(anchors, read_set)
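# solve_WIS2 is imported from the correction module. It solves classic weighted
# interval scheduling over intervals sorted by finish position. A minimal DP
# sketch of that algorithm (an illustration, not the package's implementation);
# intervals are (start, stop, weight, info) tuples, and two intervals are taken
# as compatible when one ends strictly before the other starts:
import bisect

def solve_WIS_sketch(intervals_sorted_by_finish):
    finishes = [stop for _, stop, _, _ in intervals_sorted_by_finish]
    n = len(intervals_sorted_by_finish)
    # p[j]: index of the rightmost interval that ends before interval j starts (-1 if none)
    p = [bisect.bisect_left(finishes, intervals_sorted_by_finish[j][0]) - 1 for j in range(n)]
    opt = [0] * (n + 1)  # opt[j] = best total weight using the first j intervals
    for j in range(1, n + 1):
        _, _, weight, _ = intervals_sorted_by_finish[j - 1]
        opt[j] = max(opt[j - 1], weight + opt[p[j - 1] + 1])
    # traceback to recover the chosen interval indices
    sol, j = [], n
    while j > 0:
        _, _, weight, _ = intervals_sorted_by_finish[j - 1]
        if weight + opt[p[j - 1] + 1] >= opt[j - 1]:
            sol.append(j - 1)
            j = p[j - 1] + 1
        else:
            j -= 1
    return sol[::-1]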