def print_intermediate_results(clusters, cluster_seq_origin, args, iter_nr):
    """Dump the current clustering state to two tab-separated files.

    Both files are written to ``<args.outfolder>/<iter_nr>``, which is
    created through ``help_functions.mkdir_p``:

    * ``pre_clusters.csv`` -- one ``cluster_id<TAB>read`` line per read, where
      ``read`` is the read accession with its last ``_``-separated field
      removed (an accession without any ``_`` is written as an empty string).
    * ``cluster_origins.csv`` -- one line per cluster holding the cluster id,
      accession, sequence, quality, score and error rate taken from
      ``cluster_seq_origin``.

    Clusters are written largest first. The number of clusters with more
    than one read and the total number of clusters are printed to stdout.

    Args:
        clusters: Mapping from cluster id to a sized iterable of read
            accession strings.
        cluster_seq_origin: Mapping from cluster id to a 7-item record
            ``(read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate)``.
        args: Object exposing the ``outfolder`` attribute.
        iter_nr: Iteration identifier, used as the sub-folder name.

    Raises:
        KeyError: If a cluster id is missing from ``cluster_seq_origin``.
    """
    path = args.outfolder + "/{0}".format(iter_nr)
    help_functions.mkdir_p(path)

    # Sort once and reuse: both files list the clusters in the same
    # largest-first order.
    clusters_by_size = sorted(clusters.items(), key=lambda x: len(x[1]), reverse=True)

    nontrivial_cluster_index = 0
    # Context managers guarantee the handles are released even when a write
    # or a lookup below raises.
    with open(os.path.join(path, "pre_clusters.csv"), "w") as outfile:
        for c_id, all_read_acc in clusters_by_size:
            for r_acc in all_read_acc:
                # Drop the last "_"-separated field of the accession.
                outfile.write("{0}\t{1}\n".format(c_id, "_".join(r_acc.split("_")[:-1])))
            if len(all_read_acc) > 1:
                nontrivial_cluster_index += 1

    print("Nr clusters larger than 1:", nontrivial_cluster_index)
    print("Nr clusters (all):", len(clusters))

    with open(os.path.join(path, "cluster_origins.csv"), "w") as origins_outfile:
        for cl_id, _ in clusters_by_size:
            # The second field of the record (b_i) is not part of the output.
            read_cl_id, _, acc, c_seq, c_qual, score, error_rate = cluster_seq_origin[cl_id]
            origins_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                read_cl_id, acc, c_seq, c_qual, score, error_rate))
def polish_sequences(centers, args):
    """Write spoa consensus references to disk and optionally polish them.

    For every cluster in ``centers`` two files are written to
    ``args.outfolder``: ``consensus_reference_<c_id>.fasta`` (the current
    consensus) and ``reads_to_consensus_<c_id>.fastq`` (all reads of the
    cluster, gathered from the files listed for it). If ``args.medaka`` is
    truthy the consensus is polished with ``run_medaka``; otherwise, if
    ``args.racon`` is truthy, with ``run_racon``. The second line of the
    polisher's ``consensus.fasta`` then replaces the consensus stored in
    ``centers``. When neither polisher is selected the references are only
    written and ``centers`` is returned unmodified.

    Stale ``consensus_reference_*`` files and output folders of the selected
    polisher left in ``args.outfolder`` are removed first.

    Args:
        centers: List of mutable records
            ``[nr_reads_in_cluster, c_id, center, all_reads]``, where
            ``all_reads`` is an iterable of read-file paths. Index 2 of a
            record is overwritten in place after polishing.
        args: Object exposing ``outfolder``, ``medaka`` and ``racon``, plus
            ``medaka_model`` or ``racon_iter`` for the selected polisher.

    Returns:
        The same ``centers`` list that was passed in.

    Raises:
        IndexError: If a polisher's ``consensus.fasta`` has fewer than two
            lines.
    """
    print("Saving spoa references to files:", os.path.join(args.outfolder, "consensus_reference_X.fasta"))

    # medaka takes precedence over racon when both are requested.
    if args.medaka:
        polisher = "medaka"
    elif args.racon:
        polisher = "racon"
    else:
        polisher = None

    # Clear output folders from a previous run; there is nothing to clear
    # when no polisher was selected.
    if polisher is not None:
        for folder in glob.glob(os.path.join(args.outfolder, polisher + "_cl_id_*")):
            shutil.rmtree(folder)

    for stale_reference in glob.glob(os.path.join(args.outfolder, "consensus_reference_*")):
        os.remove(stale_reference)

    for i, (nr_reads_in_cluster, c_id, center, all_reads) in enumerate(centers):
        spoa_center_file = os.path.join(args.outfolder, "consensus_reference_{0}.fasta".format(c_id))
        with open(spoa_center_file, "w") as center_out:
            center_out.write(">{0}\n{1}\n".format(
                "consensus_cl_id_{0}_total_supporting_reads_{1}".format(c_id, nr_reads_in_cluster),
                center))

        all_reads_file = os.path.join(args.outfolder, "reads_to_consensus_{0}.fastq".format(c_id))
        with open(all_reads_file, "w") as reads_out:
            for fasta_file in all_reads:
                # The dict is built inside the ``with`` so the parser has
                # consumed the handle before it is closed. Keying on the
                # accession collapses duplicate accessions within one file.
                with open(fasta_file, "r") as reads_in:
                    reads = {acc: (seq, qual)
                             for acc, (seq, qual) in help_functions.readfq(reads_in)}
                for acc, (seq, qual) in reads.items():
                    reads_out.write("@{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))

        if polisher is None:
            continue

        print("running {0} on spoa reference {1}.".format(polisher, c_id))
        polishing_outfolder = os.path.join(args.outfolder, "{0}_cl_id_{1}".format(polisher, c_id))
        help_functions.mkdir_p(polishing_outfolder)
        if polisher == "medaka":
            run_medaka(all_reads_file, spoa_center_file, polishing_outfolder, "1", args.medaka_model)
        else:
            run_racon(all_reads_file, spoa_center_file, polishing_outfolder, "1", args.racon_iter)
        print("Saving {0} reference to file:".format(polisher),
              os.path.join(args.outfolder, "{0}_cl_id_{1}/consensus.fasta".format(polisher, c_id)))

        with open(os.path.join(polishing_outfolder, "consensus.fasta"), "r") as polished:
            polished_lines = polished.readlines()
        # Line 0 is the FASTA header; line 1 is taken as the polished sequence.
        centers[i][2] = polished_lines[1].strip()

    return centers
logfile.close() print("Sorted all reads in {0} seconds.".format(time() - start) ) return reads_sorted_outfile.name if __name__ == '__main__': parser = argparse.ArgumentParser(description="Evaluate pacbio IsoSeq transcripts.") parser.add_argument('--fastq', type=str, default=False, help='Path to consensus fastq file(s)') parser.add_argument('--flnc', type=str, default=False, help='The flnc reads generated by the isoseq3 algorithm (BAM file)') parser.add_argument('--ccs', type=str, default=False, help='Path to lima demultiplexed BAM file') parser.add_argument('--outfile', type=str, default=None, help='A fasta file with transcripts that are shared between samples and have perfect illumina support.') parser.add_argument('--k', type=int, default=15, help='kmer size') args = parser.parse_args() if (args.fastq and (args.flnc or args.ccs)): print("Either (1) only a fastq file, or (2) a ccs and a flnc file should be specified. ") sys.exit() if (args.flnc != False and args.ccs == False ) or (args.flnc == False and args.ccs != False ): print("qt-clust needs both the ccs.bam file produced by ccs and the flnc file produced by isoseq3 cluster. ") sys.exit() if len(sys.argv)==1: parser.print_help() sys.exit() path_, file_prefix = os.path.split(args.outfile) help_functions.mkdir_p(path_) main(args)