예제 #1
0
def print_intermediate_results(clusters, cluster_seq_origin, args, iter_nr):
    """Write the clusters of one iteration to two tab-separated files.

    A folder ``<args.outfolder>/<iter_nr>`` is created (via
    ``help_functions.mkdir_p``) and two files are written into it, both with
    clusters ordered from largest to smallest:

    * ``pre_clusters.csv`` -- one line per read: the cluster id and the read
      accession with its last ``_``-separated field removed.
    * ``cluster_origins.csv`` -- one line per cluster with the fields
      ``read_cl_id, acc, c_seq, c_qual, score, error_rate`` taken from
      ``cluster_seq_origin``.

    The number of clusters with more than one read and the total number of
    clusters are printed to stdout.

    Args:
        clusters: mapping of cluster id to a sized iterable of read accession
            strings.
        cluster_seq_origin: mapping of cluster id to a 7-item sequence
            ``(read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate)``.
        args: object exposing the output directory as ``args.outfolder``.
        iter_nr: iteration number, used as the name of the sub-folder.

    Raises:
        KeyError: if a cluster id is missing from ``cluster_seq_origin``.
    """
    path = args.outfolder + "/{0}".format(iter_nr)
    help_functions.mkdir_p(path)

    # Sort once and reuse the ordering for both files. sorted() is stable, so
    # this yields exactly the order two separate identical sorts would give.
    clusters_by_size = sorted(clusters.items(),
                              key=lambda x: len(x[1]),
                              reverse=True)

    nontrivial_cluster_index = 0
    # Context managers guarantee the handles are closed even if a write or a
    # lookup below raises.
    with open(os.path.join(path, "pre_clusters.csv"), "w") as outfile:
        for c_id, all_read_acc in clusters_by_size:
            for r_acc in all_read_acc:
                # Drop the last underscore-separated field of the accession.
                outfile.write("{0}\t{1}\n".format(
                    c_id, "_".join(r_acc.split("_")[:-1])))
            if len(all_read_acc) > 1:
                nontrivial_cluster_index += 1

    print("Nr clusters larger than 1:", nontrivial_cluster_index)
    print("Nr clusters (all):", len(clusters))

    with open(os.path.join(path, "cluster_origins.csv"), "w") as origins_outfile:
        for cl_id, _ in clusters_by_size:
            # b_i is unpacked (the record must have seven fields) but is not
            # part of the written table.
            read_cl_id, b_i, acc, c_seq, c_qual, score, error_rate = cluster_seq_origin[
                cl_id]
            origins_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(
                read_cl_id, acc, c_seq, c_qual, score, error_rate))
예제 #2
0
def polish_sequences(centers, args):
    """Write each cluster's spoa consensus and reads to disk and polish it.

    For every entry of ``centers`` two files are written to
    ``args.outfolder``: ``consensus_reference_<c_id>.fasta`` holding the
    current consensus and ``reads_to_consensus_<c_id>.fastq`` holding the
    reads of the cluster. If ``args.medaka`` (checked first) or ``args.racon``
    is set, the corresponding polisher is run on these files in
    ``<polisher>_cl_id_<c_id>`` and the second line of its
    ``consensus.fasta`` replaces the consensus stored in ``centers``.
    If neither flag is set, the files are written and ``centers`` is returned
    unchanged.

    Output of earlier runs (``<polisher>_cl_id_*`` folders of the selected
    polisher and ``consensus_reference_*`` files) is deleted first.

    Args:
        centers: list of mutable sequences
            ``[nr_reads_in_cluster, c_id, center, all_reads]`` where
            ``all_reads`` is an iterable of paths to read files parsed with
            ``help_functions.readfq``. Index 2 is overwritten in place when a
            polisher runs.
        args: object exposing ``outfolder``, ``medaka``, ``racon`` and,
            depending on the polisher, ``medaka_model`` or ``racon_iter``.

    Returns:
        The same ``centers`` list that was passed in.

    Raises:
        IndexError: if a polisher's ``consensus.fasta`` has fewer than two
            lines.
    """
    print("Saving spoa references to files:", os.path.join(args.outfolder, "consensus_reference_X.fasta"))

    # Decide the polisher once; medaka wins if both flags are set.
    if args.medaka:
        polisher = "medaka"
    elif args.racon:
        polisher = "racon"
    else:
        polisher = None

    # Remove polishing folders left by a previous run. Skipped when no
    # polisher is selected (there is no pattern to clean in that case).
    if polisher is not None:
        polishing_pattern = os.path.join(args.outfolder, "{0}_cl_id_*".format(polisher))
        for folder in glob.glob(polishing_pattern):
            shutil.rmtree(folder)

    spoa_pattern = os.path.join(args.outfolder, "consensus_reference_*")
    for stale_file in glob.glob(spoa_pattern):
        os.remove(stale_file)

    for i, (nr_reads_in_cluster, c_id, center, all_reads) in enumerate(centers):
        spoa_center_file = os.path.join(args.outfolder, "consensus_reference_{0}.fasta".format(c_id))
        with open(spoa_center_file, "w") as f:
            f.write(">{0}\n{1}\n".format("consensus_cl_id_{0}_total_supporting_reads_{1}".format(c_id, nr_reads_in_cluster), center))

        all_reads_file = os.path.join(args.outfolder, "reads_to_consensus_{0}.fastq".format(c_id))
        with open(all_reads_file, "w") as f:
            for fasta_file in all_reads:
                with open(fasta_file, 'r') as reads_handle:
                    # Keyed on accession: an accession repeated within one
                    # file is written once, with the last record's data.
                    reads = {acc: (seq, qual) for acc, (seq, qual) in help_functions.readfq(reads_handle)}
                for acc, (seq, qual) in reads.items():
                    f.write("@{0}\n{1}\n{2}\n{3}\n".format(acc, seq, "+", qual))

        if polisher is None:
            continue

        print("running {0} on spoa reference {1}.".format(polisher, c_id))
        polishing_outfolder = os.path.join(args.outfolder, "{0}_cl_id_{1}".format(polisher, c_id))
        help_functions.mkdir_p(polishing_outfolder)
        if polisher == "medaka":
            run_medaka(all_reads_file, spoa_center_file, polishing_outfolder, "1", args.medaka_model)
        else:
            run_racon(all_reads_file, spoa_center_file, polishing_outfolder, "1", args.racon_iter)
        print("Saving {0} reference to file:".format(polisher), os.path.join(args.outfolder, "{0}_cl_id_{1}/consensus.fasta".format(polisher, c_id)))

        with open(os.path.join(polishing_outfolder, "consensus.fasta"), 'r') as consensus_handle:
            polished_lines = consensus_handle.readlines()
        # Line 0 is the fasta header; line 1 is taken as the polished sequence.
        centers[i][2] = polished_lines[1].strip()

    return centers
예제 #3
0
    logfile.close()
    print("Sorted all reads in {0} seconds.".format(time() - start) )
    return reads_sorted_outfile.name


if __name__ == '__main__':
    # Command-line entry point: parse and validate the arguments, make sure
    # the output directory exists, then hand over to main().
    parser = argparse.ArgumentParser(description="Evaluate pacbio IsoSeq transcripts.")
    parser.add_argument('--fastq', type=str,  default=False, help='Path to consensus fastq file(s)')
    parser.add_argument('--flnc', type=str, default=False, help='The flnc reads generated by the isoseq3 algorithm (BAM file)')
    parser.add_argument('--ccs', type=str, default=False, help='Path to lima demultiplexed BAM file')
    parser.add_argument('--outfile', type=str,  default=None, help='A fasta file with transcripts that are shared between samples and have perfect illumina support.')
    parser.add_argument('--k', type=int, default=15, help='kmer size')

    args = parser.parse_args()

    # Called without any argument: show the usage text and stop.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit()

    # Input is either a fastq file or a (ccs, flnc) pair, never both.
    if args.fastq and (args.flnc or args.ccs):
        print("Either (1) only a fastq file, or (2) a ccs and a flnc file should be specified. ")
        sys.exit(1)  # non-zero status so calling scripts can detect the failure

    # Exactly one of --flnc / --ccs was given (an unset option is False).
    if (args.flnc is False) != (args.ccs is False):
        print("qt-clust needs both the ccs.bam file produced by ccs and the flnc file produced by isoseq3 cluster. ")
        sys.exit(1)

    # --outfile defaults to None, which os.path.split() cannot handle.
    if args.outfile is None:
        print("An output file must be specified with --outfile.")
        sys.exit(1)

    path_, file_prefix = os.path.split(args.outfile)
    # A bare file name has an empty directory part, i.e. the current
    # directory; there is nothing to create in that case.
    if path_:
        help_functions.mkdir_p(path_)

    main(args)