Example #1
def fastq_single_core(args):
    k = args.k
    q_threshold = args.quality_threshold
    error_rates = []
    read_array = []
    for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(args.fastq, 'r'))):
        if i % 10000 == 0:
            print(i, "reads processed.")

        # skip very short reads or degenerate reads
        seq_hpol_comp = ''.join(ch for ch, _ in itertools.groupby(seq))
        if len(seq) < 2*k or len(seq_hpol_comp) < args.k:
            continue
        ########################
    
        exp_errors_in_kmers = expected_number_of_erroneous_kmers(qual, k)
        p_no_error_in_kmers = 1.0 - exp_errors_in_kmers / float(len(seq) - k + 1)
        score = p_no_error_in_kmers * (len(seq) - k + 1)
        
        ## For (inferred) average error rate only, based on quality values
        ### These values are used in evaluations in the paper only, and are not used in clustering
        poisson_mean = sum([ qual.count(char_) * D_no_min[char_] for char_ in set(qual)])
        error_rate = poisson_mean/float(len(qual))
        if 10*-math.log(error_rate, 10) <= q_threshold:
            # print("Filtered read with:", 10*-math.log(error_rate, 10), error_rate)
            continue
        error_rates.append(error_rate)
        ##############################################
        
        read_array.append((acc, seq, qual, score) )

    read_array.sort(key=lambda x: x[3], reverse=True)
    return read_array, error_rates
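
The function above leans on two helpers that are not shown on this page: `D_no_min`, a lookup from a Phred+33 quality character to its error probability, and `expected_number_of_erroneous_kmers`, the expected number of k-mers in the read that contain at least one error. A minimal sketch of what they could look like (the table contents and the exact summation are assumptions based on how the values are used above) is:

# Assumed lookup: Phred+33 character -> error probability 10^(-Q/10).
# The "_no_min" suffix is taken to mean no lower bound is imposed on the probability.
D_no_min = {chr(i): 10 ** (-(i - 33) / 10.0) for i in range(33, 127)}

def expected_number_of_erroneous_kmers(qual, k):
    # Expected number of k-mers containing at least one sequencing error:
    # sum over all k-mer windows of 1 - prod(1 - p_i) for the positions in the window.
    probs = [D_no_min[c] for c in qual]
    exp_errors = 0.0
    for i in range(len(probs) - k + 1):
        p_no_error = 1.0
        for p in probs[i:i + k]:
            p_no_error *= (1.0 - p)
        exp_errors += 1.0 - p_no_error
    return exp_errors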
Example #2
def print_read_categories(reads_unindexed, reads_indexed, reads, outfolder,
                          SAM_file):
    reads_to_align = open(
        os.path.join(outfolder, "reads_after_genomic_filtering.fasta"), "w")
    unindexed_aligned = pysam.AlignmentFile(os.path.join(
        outfolder, "unindexed.sam"),
                                            "w",
                                            template=SAM_file)
    indexed_aligned = pysam.AlignmentFile(os.path.join(outfolder,
                                                       "indexed.sam"),
                                          "w",
                                          template=SAM_file)

    for acc, (seq, _) in help_functions.readfq(open(reads, "r")):
        if acc in reads_unindexed:
            read = reads_unindexed[acc]
            read.set_tag('XA', "")
            read.set_tag('XC', "uLTRA_unindexed")
            unindexed_aligned.write(read)
        else:
            reads_to_align.write(">{0}\n{1}\n".format(acc, seq))
            if acc in reads_indexed:
                read = reads_indexed[acc]
                indexed_aligned.write(read)

    unindexed_aligned.close()
    indexed_aligned.close()
    reads_to_align.close()
    return reads_to_align.name
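
Every example on this page iterates over `help_functions.readfq`, a generator that yields `(name, (sequence, qualities))` for each record in a FASTA or FASTQ file, with `qualities` set to None for FASTA input. The helper itself is not reproduced here; a minimal sketch, adapted from Heng Li's well-known readfq recipe and reshaped to yield the nested tuple these examples expect, could look like this:

def readfq(fp):
    # Yield (name, (sequence, qualities)) from a FASTA/FASTQ file object; qualities is None for FASTA.
    last = None  # buffer for the last unprocessed header line
    while True:
        if not last:  # find the next record header
            for line in fp:
                if line[0] in '>@':
                    last = line[:-1]
                    break
        if not last:
            break
        name, seqs, last = last[1:].partition(" ")[0], [], None
        for line in fp:  # read the sequence lines
            if line[0] in '@+>':
                last = line[:-1]
                break
            seqs.append(line[:-1])
        if not last or last[0] != '+':  # FASTA record
            yield name, (''.join(seqs), None)
            if not last:
                break
        else:  # FASTQ record: read quality characters until they match the sequence length
            seq, length, quals = ''.join(seqs), 0, []
            for line in fp:
                quals.append(line[:-1])
                length += len(line) - 1
                if length >= len(seq):
                    yield name, (seq, ''.join(quals))
                    last = None
                    break
            else:  # EOF before enough quality characters: fall back to a FASTA-style record
                yield name, (seq, None)
                break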
Example #3
def read_barcodes(primer_file):
    barcodes = { acc + '_fw' : seq.strip() for acc, (seq, _) in help_functions.readfq(open(primer_file, 'r'))}

    for acc, seq in list(barcodes.items()):
        print(acc, seq, acc[:-3])
        barcodes[acc[:-3] + '_rc'] = reverse_complement(seq.upper())

    print(barcodes)
    return barcodes
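
`read_barcodes` also depends on a `reverse_complement` helper that is not shown here. A minimal sketch, assuming plain A/C/G/T/N sequences (IUPAC ambiguity codes would need a larger translation table), is:

# Hypothetical helper: complement table for the four standard bases plus N.
_COMPLEMENT = str.maketrans("ACGTN", "TGCAN")

def reverse_complement(seq):
    # Complement each base, then reverse the string.
    return seq.translate(_COMPLEMENT)[::-1]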
Example #4
def polish_sequences(centers, args):
    print("Saving spoa references to files:", os.path.join(args.outfolder, "consensus_reference_X.fasta"))
    # printing output from spoa and grouping reads
    # to_polishing = []
    if args.medaka:
        polishing_pattern = os.path.join(args.outfolder, "medaka_cl_id_*")
    elif args.racon:
        polishing_pattern = os.path.join(args.outfolder, "racon_cl_id_*")
    else:
        polishing_pattern = None  # no polisher requested; nothing to clean up

    if polishing_pattern:
        for folder in glob.glob(polishing_pattern):
            shutil.rmtree(folder)

    spoa_pattern = os.path.join(args.outfolder, "consensus_reference_*")
    for file in glob.glob(spoa_pattern):
        os.remove(file)

    for i, (nr_reads_in_cluster, c_id, center, all_reads) in enumerate(centers):
        # print('lol',c_id,center)
        spoa_center_file = os.path.join(args.outfolder, "consensus_reference_{0}.fasta".format(c_id))
        f = open(spoa_center_file, "w")
        f.write(">{0}\n{1}\n".format("consensus_cl_id_{0}_total_supporting_reads_{1}".format(c_id, nr_reads_in_cluster), center))
        f.close()
        
        all_reads_file = os.path.join(args.outfolder, "reads_to_consensus_{0}.fastq".format(c_id))
        f = open(all_reads_file, "w")
        for fasta_file in all_reads: 
            reads = { acc : (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(fasta_file, 'r'))}
            for acc, (seq, qual) in reads.items():
                f.write("@{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))
        f.close()
        # to_polishing.append( (nr_reads_in_cluster, c_id, spoa_center_file, all_reads_file) )

        if args.medaka:
            print("running medaka on spoa reference {0}.".format(c_id))
            # for (nr_reads_in_cluster, c_id, spoa_center_file, all_reads_file) in to_polishing:
            polishing_outfolder = os.path.join(args.outfolder, "medaka_cl_id_{0}".format(c_id))
            help_functions.mkdir_p(polishing_outfolder)
            run_medaka(all_reads_file, spoa_center_file, polishing_outfolder, "1", args.medaka_model)
            print("Saving medaka reference to file:", os.path.join(args.outfolder, "medaka_cl_id_{0}/consensus.fasta".format(c_id)))   
            with open(os.path.join(polishing_outfolder, "consensus.fasta"), 'r') as cons_file:
                center_polished = cons_file.readlines()[1].strip()
            centers[i][2] = center_polished
        elif args.racon:
            print("running racon on spoa reference {0}.".format(c_id))
            # for (nr_reads_in_cluster, c_id, spoa_center_file, all_reads_file) in to_polishing:
            polishing_outfolder = os.path.join(args.outfolder, "racon_cl_id_{0}".format(c_id))
            help_functions.mkdir_p(polishing_outfolder)
            run_racon(all_reads_file, spoa_center_file, polishing_outfolder, "1", args.racon_iter)
            print("Saving racon reference to file:", os.path.join(args.outfolder, "racon_cl_id_{0}/consensus.fasta".format(c_id)))   
            with open(os.path.join(polishing_outfolder, "consensus.fasta"), 'r') as cons_file:
                center_polished = cons_file.readlines()[1].strip()
            centers[i][2] = center_polished

    return centers
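
`run_medaka` and `run_racon` are thin wrappers around the respective polishing tools and are not part of this page. As a rough illustration of the racon path only, a wrapper might map the cluster reads back onto the spoa consensus with minimap2 and feed the alignments to racon; the signature and the final consensus.fasta location mirror how polish_sequences calls it and reads the result back, but everything else below (presets, iteration scheme) is an assumption:

import os
import subprocess

def run_racon(reads_file, reference_file, outfolder, n_threads, racon_iter):
    # Hypothetical sketch: iteratively polish reference_file with racon.
    # Assumes the minimap2 and racon executables are on PATH.
    current_ref = reference_file
    for it in range(int(racon_iter)):
        paf = os.path.join(outfolder, "overlaps_{0}.paf".format(it))
        with open(paf, "w") as paf_out:
            subprocess.check_call(
                ["minimap2", "-x", "map-ont", "-t", str(n_threads), current_ref, reads_file],
                stdout=paf_out)
        polished = os.path.join(outfolder, "racon_{0}.fasta".format(it))
        with open(polished, "w") as fasta_out:
            subprocess.check_call(
                ["racon", "-t", str(n_threads), reads_file, paf, current_ref],
                stdout=fasta_out)
        current_ref = polished

    # polish_sequences() expects the result at <outfolder>/consensus.fasta
    final = os.path.join(outfolder, "consensus.fasta")
    with open(final, "w") as out, open(current_ref, "r") as src:
        out.write(src.read())
    return final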
Example #5
def fastq_parallel(args):
    k = args.k
    q_threshold = args.quality_threshold
    error_rates = []
    reads = [(acc, seq, qual)
             for acc, (seq,
                       qual) in help_functions.readfq(open(args.fastq, 'r'))]
    start = time()
    read_chunk_size = int(len(reads) / args.nr_cores) + 1
    read_batches = [b for b in batch(reads, read_chunk_size)]
    del reads
    ####### parallelize alignment #########
    # pool = Pool(processes=mp.cpu_count())
    # Ignore SIGINT while the worker pool is set up, then restore the original
    # handler in the parent so Ctrl-C is caught as KeyboardInterrupt below.
    original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    mp.set_start_method('spawn')
    print("Environment set:", mp.get_context())
    print("Using {0} cores.".format(args.nr_cores))
    start_multi = time()
    pool = Pool(processes=int(args.nr_cores))
    signal.signal(signal.SIGINT, original_sigint_handler)
    try:
        print([len(b) for b in read_batches])
        data = [{i: (b, k, q_threshold)} for i, b in enumerate(read_batches)]
        res = pool.map_async(calc_score_new, data)
        score_results = res.get(
            999999999
        )  # Without the timeout this blocking call ignores all signals.
    except KeyboardInterrupt:
        print("Caught KeyboardInterrupt, terminating workers")
        pool.terminate()
        sys.exit()
    else:
        pool.close()
    pool.join()

    print("Time elapesd multiprocessing:", time() - start_multi)
    read_array, error_rates = [], []

    for output_dict in score_results:
        for batch_index, (r_a, err_rates) in output_dict.items():
            print("Batch index", batch_index)
            read_array.extend(r_a)
            error_rates.extend(err_rates)

    read_array.sort(key=lambda x: x[3], reverse=True)
    error_rates.sort()
    return read_array, error_rates
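
`fastq_parallel` additionally assumes a `batch` helper that splits the reads into chunks of `read_chunk_size`, and a `calc_score_new` worker that applies the same per-read scoring as Example #1 to one chunk. A sketch of the chunking helper, written so that it also covers the dict-of-reads call in Example #7, could be:

def batch(iterable, size):
    # Yield chunks with at most `size` items; dicts come back as smaller dicts
    # (Example #7 batches a dict of reads), other iterables as lists.
    if isinstance(iterable, dict):
        items = list(iterable.items())
        for start in range(0, len(items), size):
            yield dict(items[start:start + size])
    else:
        items = list(iterable)
        for start in range(0, len(items), size):
            yield items[start:start + size]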
Example #6
def form_draft_consensus(clusters, representatives, sorted_reads_fastq_file, work_dir, abundance_cutoff, args):
    centers = []
    reads = { acc : (seq, qual) for acc, (seq, qual) in help_functions.readfq(open(sorted_reads_fastq_file, 'r'))}
    for c_id, all_read_acc in sorted(clusters.items(), key=lambda x: (len(x[1]), representatives[x[0]][5]), reverse=True):
        nr_reads_in_cluster = len(all_read_acc)
        if nr_reads_in_cluster >= abundance_cutoff:
            # only create a read file for clusters that pass the abundance cutoff
            reads_path = open(os.path.join(work_dir, "reads_c_id_{0}.fq".format(c_id)), "w")
            for acc in all_read_acc:
                seq, qual = reads[acc]
                reads_path.write("@{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))
            reads_path.close()
            # spoa_ref = create_augmented_reference.run_spoa(reads_path.name, os.path.join(work_dir,"spoa_tmp.fa"), "spoa")
            print("creating center of {0} sequences.".format(nr_reads_in_cluster))
            center = run_spoa(reads_path.name, os.path.join(work_dir,"spoa_tmp.fa"), "spoa")
            centers.append( [nr_reads_in_cluster, c_id, center, reads_path.name])
    return centers
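
`run_spoa` wraps the spoa executable to build a consensus of the cluster's reads; it is not reproduced on this page. A rough sketch, assuming the binary name passed in is on PATH, that `-r 0` selects consensus-only output, and that spoa prints a FASTA-style header followed by the consensus on a single line, is:

import subprocess

def run_spoa(reads_file, tmp_out_file, spoa_binary):
    # Hypothetical wrapper: run spoa on the cluster reads and return the consensus string.
    with open(tmp_out_file, "w") as out:
        subprocess.check_call([spoa_binary, reads_file, "-r", "0"], stdout=out)
    with open(tmp_out_file, "r") as out:
        lines = out.readlines()
    return lines[1].strip()  # line 0 is the header, line 1 the consensus sequence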
Example #7
def main(args):
    # start = time()
    all_reads = { i : (acc, seq, qual) for i, (acc, (seq, qual)) in enumerate(help_functions.readfq(open(args.fastq, 'r')))}
    eprint("Total cluster of {0} reads.".format(len(all_reads)))
    if len(all_reads) <= args.exact_instance_limit:
        args.exact = True
    if args.set_w_dynamically:
        args.w = args.k + min(7, int( len(all_reads)/500))

    eprint("ARGUMENT SETTINGS:")
    for key, value in args.__dict__.items():
        eprint("{0}: {1}".format(key, value))
        # setattr(self, key, value)
    eprint()

    work_dir = tempfile.mkdtemp()
    print("Temporary workdirektory:", work_dir)
    anchors_to_read_acc = {}
    k_size = args.k
    for batch_id, reads in enumerate(batch(all_reads, args.max_seqs)):
        print("correcting {0} reads in a batch".format(len(reads)))
        batch_start_time = time()
        w = args.w
        x_high = args.xmax
        x_low = args.xmin
        hash_fcn = "lex"

        minimizer_database  = get_minimizers_and_positions(reads, w, k_size, hash_fcn)
        minimizer_combinations_database = get_minimizer_combinations_database(reads, minimizer_database, k_size, x_low, x_high)
        # print(minimizer_database)
        if args.verbose:
            eprint("done creating minimizer combinations")

        tot_corr = 0
        previously_corrected_regions = defaultdict(list)
        tmp_cnt = 0

        for r_id in sorted(reads): #, reverse=True):
            read_min_comb = [ ((m1,p1), m1_curr_spans) for (m1,p1), m1_curr_spans in  minimizers_comb_iterator(minimizer_database[r_id], k_size, x_low, x_high)]
            # print(read_min_comb)
            # sys.exit()
            if args.exact:
                previously_corrected_regions = defaultdict(list)
            # stored_calculated_regions = defaultdict(list)
    
            #  = stored_calculated_regions[r_id]
            corr_pos = []
            (acc, seq, qual) = reads[r_id]
            # print("starting correcting:", seq)

            # print(r_id, sorted(previously_corrected_regions[r_id], key=lambda x:x[1]))
            read_previously_considered_positions = set([tmp_pos for tmp_p1, tmp_p2, w_tmp, _ in previously_corrected_regions[r_id] for tmp_pos in range(tmp_p1, tmp_p2)])
            
            if args.verbose:
                if read_previously_considered_positions:
                    eprint("not corrected:", [ (p1_, p2_) for p1_, p2_ in zip(sorted(read_previously_considered_positions)[:-1], sorted(read_previously_considered_positions)[1:]) if p2_ > p1_ + 1 ] )
                else:
                    eprint("not corrected: entire read", )

            if previously_corrected_regions[r_id]:
                read_previously_considered_positions = set([tmp_pos for tmp_p1, tmp_p2, w_tmp, _ in previously_corrected_regions[r_id] for tmp_pos in range(tmp_p1, tmp_p2)])
                group_id = 0
                pos_group = {}
                sorted_corr_pos = sorted(read_previously_considered_positions)
                for p1, p2 in zip(sorted_corr_pos[:-1], sorted_corr_pos[1:]):
                    if p2 > p1 + 1:
                       pos_group[p1] = group_id 
                       group_id += 1
                       pos_group[p2] = group_id 
                    else:
                       pos_group[p1] = group_id 
                if p2 == p1 + 1:
                    pos_group[p2] = group_id 
            else:
                read_previously_considered_positions= set()
                pos_group = {}

            already_computed = {}
            read_complexity_cnt = 0
            # test_cnt = 0
            # old_cnt = 0
            # test_cnt2 = 0
            all_intervals = []
            # prev_visited_intervals = []

            for (m1,p1), m1_curr_spans in read_min_comb: 
                # If any position is not in range of current corrections: then correct, not just start and stop
                not_prev_corrected_spans = [(m2,p2) for (m2,p2) in m1_curr_spans if not (p1 + k_size in read_previously_considered_positions and p2 - 1 in read_previously_considered_positions) ] 
                set_not_prev = set(not_prev_corrected_spans)
                not_prev_corrected_spans2 = [(m2,p2) for (m2,p2) in m1_curr_spans if (m2,p2) not in set_not_prev and (p1 + k_size in pos_group and p2 - 1 in pos_group and pos_group[p1 + k_size] != pos_group[p2 - 1]) ] 
                not_prev_corrected_spans += not_prev_corrected_spans2


                if not_prev_corrected_spans: # p1 + k_size not in read_previously_considered_positions:
                    tmp_cnt, read_complexity_cnt = find_most_supported_span(r_id, m1, p1, not_prev_corrected_spans, minimizer_combinations_database, reads, all_intervals, k_size, tmp_cnt, read_complexity_cnt, already_computed)

            # sys.exit()
            if args.verbose:
                print("{0} edlib invoked due to repeated anchors for this read.".format(read_complexity_cnt))
                print(tmp_cnt, "total computed editdist.")
                eprint("Correcting read", r_id)

            # add prev_visited_intervals to intervals to consider
            # all_intervals.extend(prev_visited_intervals)

            if previously_corrected_regions[r_id]: # add previously corrected regions in to the solver
                all_intervals.extend(previously_corrected_regions[r_id])
                del previously_corrected_regions[r_id]

            if not all_intervals:
                # eprint("Found nothing to correct")
                corrected_seq = seq
            else:
                all_intervals_sorted_by_finish = sorted(all_intervals, key = lambda x: x[1])
                opt_indicies = solve_WIS2(all_intervals_sorted_by_finish) # solve Weighted Interval Scheduling here to find set of best non overlapping intervals to correct over
                sol = []
                prev_stop = 0
                for j in opt_indicies:
                    start, stop, weights, instance = all_intervals_sorted_by_finish[j]
                    sol.append( ( instance["curr_read"][0][:k_size], instance["curr_read"][0][-k_size:] ) )
                    
                    if start - k_size > prev_stop and prev_stop > 0:
                        if args.verbose:
                            eprint("Gap in correction:", start-k_size - prev_stop, "between positions:", prev_stop, start, )
                    prev_stop = stop + k_size

                hashable_sol = tuple(sol)
                if hashable_sol not in anchors_to_read_acc:
                    anchors_to_read_acc[hashable_sol] = set()
                anchors_to_read_acc[hashable_sol].add(acc)
                # hashable_sol only exists when a correction solution was built, so report progress here
                print("processed:", r_id, len(anchors_to_read_acc), len(hashable_sol), hashable_sol)
                
    eprint( "Number of unique transcripts (based on anchor solution):", len(anchors_to_read_acc))
    for anchors, read_set in anchors_to_read_acc.items():
        print(anchors, read_set)        
    eprint( "Number of unique transcripts (based on anchor solution):", len(anchors_to_read_acc))