示例#1
0
def _main(data,output_folder,num_threads,overwrite=False):
    """Run the whole CNS identification pipeline, one aligned chromosome at a time.

    Stages, in order: store ``output_folder`` under ``data['out']``, build the
    genome beds with ``create_genome_beds``, call ``chrom_cns_identify`` for
    every key of ``data['ref_aligned_chroms']`` (in sorted order), then merge
    the per-chromosome results with ``combine_cns``. ``data`` is handed to the
    ``JSON_saver`` before the first stage, after each chromosome and at the end.

    Args:
        data: pipeline dict. Must contain ``'ref_aligned_chroms'``, a mapping
            whose values hold ``'chrom_seq_maf'`` and
            ``'chrom_conservation_wig'``.
        output_folder: base output location, stored as ``data['out']``. It is
            concatenated directly with sub-folder names, so it presumably
            already ends with a path separator -- TODO confirm.
        num_threads: forwarded unchanged to ``chrom_cns_identify``.
        overwrite: forwarded to every ``create_path`` call and every stage.

    Returns:
        The dict returned by ``combine_cns``.
    """
    data['out'] = output_folder
    record_path = create_path(data['out'],"record","json",overwrite=overwrite)
    datasaver = JSON_saver(record_path)
    datasaver.save(data)

    alignment_count = len(data["ref_aligned_chroms"])
    header_print("Running full CNS identification pipeline on %s alignment files" % alignment_count,h_type=1)
    data['genome_beds'] = create_path(data['out']+"genome_beds",overwrite=overwrite)
    data = create_genome_beds(data,data['genome_beds'],overwrite=overwrite)

    for chromosome in sorted(data['ref_aligned_chroms']):
        header_print("Identify CNS on %s" % chromosome,h_type=2)

        # Per-chromosome view of the record: everything except the
        # chromosome table itself.
        # NOTE(review): this is a shallow copy, so nested values (for example
        # data['genomes']) are shared with `data`; if chrom_cns_identify
        # mutates them in place, all chromosome records end up referencing
        # the same object -- confirm this is intended.
        chrom_data = {}
        for key, value in data.items():
            if key != "ref_aligned_chroms":
                chrom_data[key] = value

        chrom_inputs = data['ref_aligned_chroms'][chromosome]
        chrom_data['chrom_seq_maf'] = chrom_inputs['chrom_seq_maf']
        chrom_data['chrom_conservation_wig'] = chrom_inputs['chrom_conservation_wig']
        chrom_data['out'] = create_path(data['out']+"chrom/"+chromosome,overwrite=overwrite)
        chrom_data = chrom_cns_identify(chrom_data,
                                        chrom_data['out'],
                                        num_threads,
                                        overwrite=overwrite,
                                        chrom_name=chromosome)

        # Store the chromosome's outputs back in the main record, leaving out
        # every key that starts with "ref_".
        data['ref_aligned_chroms'][chromosome] = dict(
            (key, value) for key, value in chrom_data.items()
            if not key.startswith("ref_"))
        datasaver.save(data)

    data = combine_cns(data,data['out'],overwrite=overwrite)
    datasaver.save(data)

    return data
示例#2
0
def _main(data,output_folder,overwrite=False):
    """Merge the per-chromosome .cns result files into one combined .cns file.

    For every key of ``data['ref_aligned_chroms']`` the file named by its
    ``'results'`` value is loaded as a ``Cns``. Each entry's ``cns_ID`` is
    rewritten to ``"<chrom>:<old id>"``, the new ID is copied onto every
    sequence of the entry, and the entry is appended to a single ``Cns``,
    which is finally written to ``data['combined_cns']``.

    Args:
        data: pipeline dict containing ``'ref_aligned_chroms'``.
        output_folder: location passed to ``create_path`` for the record file
            and for the combined ``.cns`` file.
        overwrite: forwarded to every ``create_path`` call.

    Returns:
        ``data``, with the ``'combined_cns'`` path added.
    """
    datasaver = JSON_saver(create_path(output_folder,"record","json",overwrite=overwrite))
    datasaver.save(data)

    header_print("Combining %s CNS files"%len(data['ref_aligned_chroms']))

    data["combined_cns"] = create_path(output_folder,"combined_identified","cns",overwrite=overwrite)
    # Progress output. The single-argument call form prints the same text
    # under Python 2 and Python 3, whereas the former print statements were a
    # SyntaxError on Python 3.
    print(len(data['ref_aligned_chroms']))
    combined = Cns()
    for position, chrom in enumerate(data['ref_aligned_chroms'], 1):
        print(position)
        chrom_cns = Cns(file_name=data['ref_aligned_chroms'][chrom]['results'])
        for entry in chrom_cns.entries:
            # Prefix the ID with the chromosome key so each ID records which
            # chromosome's file it came from.
            entry.cns_ID = "%s:%s" %(chrom,str(entry.cns_ID))
            for genome in entry.sequences:
                for seq in entry.sequences[genome]:
                    seq.cns_ID = entry.cns_ID
            combined.entries.append(entry)
    combined.save_file(data["combined_cns"])
    datasaver.save(data)

    return data
示例#3
0
def _main(data, output_folder, overwrite=False):
    """Convert GFF3 annotations into the .bed files used by later stages.

    Two conversions are performed and their output paths recorded in ``data``:

    * ``data['ref_coding_bed']``: the ``CDS`` features of the reference
      genome's GFF3, written by ``gff3_to_bed`` with the sequence prefix
      ``"<ref_genome>:"``.
    * ``data['genomes'][g]['annot_bed']`` for every genome ``g``: the ``gene``
      features of that genome's GFF3, produced through ``Gff3.to_bed``.

    Args:
        data: pipeline dict with ``'ref_genome'`` and ``'genomes'``; every
            genome entry must provide ``'annot_gff3'``.
        output_folder: location handed to ``create_path``. It is concatenated
            directly with ``"genome_annot_beds"``, so it presumably ends with
            a path separator -- TODO confirm.
        overwrite: forwarded to every ``create_path`` call.

    Returns:
        ``data``, extended with the paths described above.
    """
    record_path = create_path(output_folder, "record", "json",
                              overwrite=overwrite)
    datasaver = JSON_saver(record_path)
    datasaver.save(data)

    # Reference genome: coding (CDS) features -> one .bed file.
    header_print("Convert coding regions to .bed:")
    data['ref_coding_bed'] = create_path(output_folder, "ref_coding", "bed",
                                         overwrite=overwrite)
    genomes = data['genomes']
    ref_name = data['ref_genome']
    gff3_to_bed(gff3_file=genomes[ref_name]['annot_gff3'],
                bed_out=data['ref_coding_bed'],
                type_list=['CDS'],
                sequence_prefix=ref_name + ":")
    datasaver.save(data)

    # Every genome: gene features -> one .bed file per genome.
    header_print("Convert per-genome gene regions to .bed:")
    beds_folder = create_path(output_folder + "genome_annot_beds",
                              overwrite=overwrite)
    data['genome_annot_beds_folder'] = beds_folder
    for genome, genome_info in data['genomes'].items():
        genome_info['annot_bed'] = create_path(beds_folder,
                                               genome + "_annot",
                                               "bed",
                                               overwrite=overwrite)
        annotation = Gff3(file_name=genome_info['annot_gff3'])
        gene_bed = annotation.to_bed(type_list=['gene'], genome=genome)
        gene_bed.save_file(genome_info['annot_bed'])
    datasaver.save(data)

    return data
示例#4
0
def run(config_file,output_folder,num_threads,overwrite=False):
    """Load a JSON configuration file and run ``_main`` on its contents.

    Args:
        config_file: path of the JSON file to parse.
        output_folder: passed through ``create_path`` before being handed on.
        num_threads: forwarded unchanged to ``_main``.
        overwrite: forwarded to ``create_path`` and ``_main``.

    Returns:
        None; the value returned by ``_main`` is discarded.
    """
    with open(config_file) as config_handle:
        settings = json.load(config_handle)
    prepared_folder = create_path(output_folder,overwrite=overwrite)
    _main(settings,prepared_folder,num_threads,overwrite=overwrite)
def _run_bedtools(arguments, out_path):
    """Run ``bedtools`` with *arguments* and write its standard output to *out_path*.

    The command is started without a shell, so file names containing spaces
    or shell metacharacters are passed to bedtools verbatim.

    Raises:
        subprocess.CalledProcessError: if bedtools exits with a non-zero status.
    """
    with open(out_path, "w") as out_handle:
        subprocess.check_call(["bedtools"] + list(arguments), stdout=out_handle)


def _main(data,output_folder,num_threads,overwrite=False,chrom_name=None):
    """Identify conserved non-coding sequences (CNS) for one alignment file.

    Every step stores the path of its output in ``data`` and hands ``data``
    to the ``JSON_saver`` before the next step starts:

    1. ``maf_to_bed``: ``data['chrom_seq_maf']`` -> ``data['ref_seq_bed']``.
    2. ``bedtools subtract``: ``ref_seq_bed`` minus ``data['ref_coding_bed']``
       -> ``data['aligned_noncoding_bed']``.
    3. ``wiggle_to_bed``: ``data['chrom_conservation_wig']``
       -> ``data['best_conserved_bed']``.
    4. ``bedtools intersect``: ``aligned_noncoding_bed`` with
       ``best_conserved_bed`` -> ``data['cns_bed']``.
    5. ``slice_maf_by_bed``: slices ``chrom_seq_maf`` by ``cns_bed``
       -> ``data['cns_maf']``.
    6. One CNS ``.bed`` per genome -> ``data['genomes'][g]['cns_bed']``.
    7. ``bedtools closest -D a`` against ``data['genomes'][g]['annot_bed']``
       -> ``data['genomes'][g]['gene_proximity_bed']``.
    8. ``Maf.cns_from_proxim_beds`` -> ``data['results']``.

    Args:
        data: pipeline dict. Must already contain ``'chrom_seq_maf'``,
            ``'chrom_conservation_wig'``, ``'ref_genome'``,
            ``'ref_coding_bed'`` and ``'genomes'`` (each genome entry with an
            ``'annot_bed'``).
        output_folder: location handed to ``create_path``. It is concatenated
            directly with sub-folder names, so it presumably ends with a path
            separator -- TODO confirm.
        num_threads: accepted for interface compatibility; not used here.
        overwrite: forwarded to every ``create_path`` call.
        chrom_name: optional label appended to the per-genome CNS bed file
            names (``<genome>_cns_<chrom_name>``). When ``None`` the files are
            named ``<genome>_cns``.

    Returns:
        ``data``, extended with the paths listed above.

    Raises:
        subprocess.CalledProcessError: if a bedtools command fails.
    """
    datasaver = JSON_saver(create_path(output_folder,"record","json",overwrite=overwrite))
    datasaver.save(data)

    #maf_to_bed
    header_print("Convert aligned sequences to .bed:")
    data['ref_seq_bed'] = create_path(output_folder,"ref_seq","bed",overwrite=overwrite)
    maf_to_bed(maf_file    = data['chrom_seq_maf'],
               bed_out     = data['ref_seq_bed'],
               ref_genome  = data['ref_genome'],
               index_tag   = "chrom_maf_index")
    datasaver.save(data)

    #$bedtools subtract
    header_print("Subtract coding regions from aligned regions:")
    data['aligned_noncoding_bed'] = create_path(output_folder,"aligned_noncoding_bed","bed",overwrite=overwrite)
    subtract_args = ["subtract","-a",data['ref_seq_bed'],"-b",data['ref_coding_bed']]
    tracker = Progress_tracker("Running bedtools subtract",1).estimate(False).display()
    _run_bedtools(subtract_args,data['aligned_noncoding_bed'])
    tracker.done()
    datasaver.save(data)

    #wiggle_to_bed
    header_print("Converting especially conserved regions in wiggle file to bed")
    data['best_conserved_bed'] = create_path(output_folder,"best_conserved","bed",overwrite=overwrite)
    wiggle_to_bed(wig_file=data['chrom_conservation_wig'],
                  out_file=data['best_conserved_bed'],
                  genome_name=data['ref_genome'])
    datasaver.save(data)

    #filter_bed_with_wiggle
    header_print("Intersecting wiggle bed with the potential cns bed")
    data['cns_bed'] = create_path(output_folder,"cns","bed",overwrite=overwrite)
    intersect_args = ["intersect","-a",data['aligned_noncoding_bed'],"-b",data['best_conserved_bed']]
    tracker = Progress_tracker("Running bedtools intersect",1).estimate(False).display()
    _run_bedtools(intersect_args,data['cns_bed'])
    tracker.done()
    datasaver.save(data)

    #slice_maf_by_bed
    header_print("Slice multi-alignment file based on identified conserved non-coding regions:")
    data['cns_maf'] = create_path(output_folder,"cns","maf",overwrite=overwrite)
    slice_maf_by_bed(maf_file       = data['chrom_seq_maf'],
                     bed_file       = data['cns_bed'],
                     index_tag      = "chrom_maf_index",
                     ref_genome     = data['ref_genome'],
                     out_file       = data['cns_maf'],
                     max_N_ratio    = 0.5,
                     max_gap_ratio  = 0.5,
                     min_len        = 15)
    datasaver.save(data)

    #maf_to_bed
    header_print("Convert per-genome CNS regions to .bed:")
    data['genome_cns_beds_folder'] = create_path(output_folder+"genome_cns_beds",overwrite=overwrite)
    # chrom_name defaults to None; concatenating None into the file name used
    # to raise TypeError, so the suffix is simply left off in that case.
    cns_suffix = "_cns" if chrom_name is None else "_cns_"+chrom_name
    cns_maf = Maf(file_name=data['cns_maf'])
    for genome in data['genomes']:
        data['genomes'][genome]['cns_bed'] = create_path(data['genome_cns_beds_folder'],genome+cns_suffix,"bed",overwrite=overwrite)
        bed = cns_maf.to_bed(genome_name=genome,index_tag="cns_maf_index")
        bed.save_file(data['genomes'][genome]['cns_bed'])
    # Drop the parsed alignment before the external bedtools runs below.
    del cns_maf
    datasaver.save(data)

    #$bedtools closest
    header_print("Find closest gene for each CNS region:")
    data['gene_proximity_beds_folder'] = create_path(output_folder+"gene_proximity_beds",overwrite=overwrite)
    for genome in data['genomes']:
        genome_info = data['genomes'][genome]
        genome_info['gene_proximity_bed'] = \
            create_path(data['gene_proximity_beds_folder'],genome+"_proxim","bed",overwrite=overwrite)
        _run_bedtools(["closest","-D","a","-a",genome_info['cns_bed'],"-b",genome_info['annot_bed']],
                      genome_info['gene_proximity_bed'])
    datasaver.save(data)

    #maf_and_proxim_bed_to_cns
    header_print("Process proximity and maf files into .cns file:")
    data['results'] = create_path(output_folder,"identified_CNSs","cns",overwrite=overwrite)
    cns_proxim_beds = {genome:Bed13(data['genomes'][genome]['gene_proximity_bed']) for genome in data['genomes']}
    Maf(file_name=data['cns_maf'])\
        .cns_from_proxim_beds(cns_proxim_beds,"cns_maf_index")\
        .save_file(data['results'])
    datasaver.save(data)

    return data
示例#6
0
def _run_bedtools(arguments, out_path):
    """Run ``bedtools`` with *arguments* and write its standard output to *out_path*.

    The command is started without a shell, so file names containing spaces
    or shell metacharacters are passed to bedtools verbatim.

    Raises:
        subprocess.CalledProcessError: if bedtools exits with a non-zero status.
    """
    with open(out_path, "w") as out_handle:
        subprocess.check_call(["bedtools"] + list(arguments),
                              stdout=out_handle)


def _main(data, output_folder, num_threads, overwrite=False, chrom_name=None):
    """Identify conserved non-coding sequences (CNS) for one alignment file.

    The function runs a fixed sequence of conversions. The output path of
    each one is stored in ``data`` and ``data`` is passed to the
    ``JSON_saver`` after every step:

    * ``data['ref_seq_bed']``: ``maf_to_bed`` of ``data['chrom_seq_maf']``.
    * ``data['aligned_noncoding_bed']``: ``bedtools subtract`` of
      ``data['ref_coding_bed']`` from ``ref_seq_bed``.
    * ``data['best_conserved_bed']``: ``wiggle_to_bed`` of
      ``data['chrom_conservation_wig']``.
    * ``data['cns_bed']``: ``bedtools intersect`` of the two previous files.
    * ``data['cns_maf']``: ``slice_maf_by_bed`` of ``chrom_seq_maf`` by
      ``cns_bed``.
    * ``data['genomes'][g]['cns_bed']``: per-genome bed made from ``cns_maf``.
    * ``data['genomes'][g]['gene_proximity_bed']``: ``bedtools closest -D a``
      of that bed against ``data['genomes'][g]['annot_bed']``.
    * ``data['results']``: the ``.cns`` file written by
      ``Maf.cns_from_proxim_beds``.

    Args:
        data: pipeline dict. Must already contain ``'chrom_seq_maf'``,
            ``'chrom_conservation_wig'``, ``'ref_genome'``,
            ``'ref_coding_bed'`` and ``'genomes'`` (each genome entry with an
            ``'annot_bed'``).
        output_folder: location handed to ``create_path``. It is concatenated
            directly with sub-folder names, so it presumably ends with a path
            separator -- TODO confirm.
        num_threads: accepted for interface compatibility; not used here.
        overwrite: forwarded to every ``create_path`` call.
        chrom_name: optional label appended to the per-genome CNS bed file
            names (``<genome>_cns_<chrom_name>``). When ``None`` the files are
            named ``<genome>_cns``.

    Returns:
        ``data``, extended with the paths listed above.

    Raises:
        subprocess.CalledProcessError: if a bedtools command fails.
    """
    datasaver = JSON_saver(
        create_path(output_folder, "record", "json", overwrite=overwrite))
    datasaver.save(data)

    #maf_to_bed
    header_print("Convert aligned sequences to .bed:")
    data['ref_seq_bed'] = create_path(output_folder,
                                      "ref_seq",
                                      "bed",
                                      overwrite=overwrite)
    maf_to_bed(maf_file=data['chrom_seq_maf'],
               bed_out=data['ref_seq_bed'],
               ref_genome=data['ref_genome'],
               index_tag="chrom_maf_index")
    datasaver.save(data)

    #$bedtools subtract
    header_print("Subtract coding regions from aligned regions:")
    data['aligned_noncoding_bed'] = create_path(output_folder,
                                                "aligned_noncoding_bed",
                                                "bed",
                                                overwrite=overwrite)
    subtract_args = [
        "subtract", "-a", data['ref_seq_bed'], "-b", data['ref_coding_bed']
    ]
    tracker = Progress_tracker("Running bedtools subtract",
                               1).estimate(False).display()
    _run_bedtools(subtract_args, data['aligned_noncoding_bed'])
    tracker.done()
    datasaver.save(data)

    #wiggle_to_bed
    header_print(
        "Converting especially conserved regions in wiggle file to bed")
    data['best_conserved_bed'] = create_path(output_folder,
                                             "best_conserved",
                                             "bed",
                                             overwrite=overwrite)
    wiggle_to_bed(wig_file=data['chrom_conservation_wig'],
                  out_file=data['best_conserved_bed'],
                  genome_name=data['ref_genome'])
    datasaver.save(data)

    #filter_bed_with_wiggle
    header_print("Intersecting wiggle bed with the potential cns bed")
    data['cns_bed'] = create_path(output_folder,
                                  "cns",
                                  "bed",
                                  overwrite=overwrite)
    intersect_args = [
        "intersect", "-a", data['aligned_noncoding_bed'], "-b",
        data['best_conserved_bed']
    ]
    tracker = Progress_tracker("Running bedtools intersect",
                               1).estimate(False).display()
    _run_bedtools(intersect_args, data['cns_bed'])
    tracker.done()
    datasaver.save(data)

    #slice_maf_by_bed
    header_print(
        "Slice multi-alignment file based on identified conserved non-coding regions:"
    )
    data['cns_maf'] = create_path(output_folder,
                                  "cns",
                                  "maf",
                                  overwrite=overwrite)
    slice_maf_by_bed(maf_file=data['chrom_seq_maf'],
                     bed_file=data['cns_bed'],
                     index_tag="chrom_maf_index",
                     ref_genome=data['ref_genome'],
                     out_file=data['cns_maf'],
                     max_N_ratio=0.5,
                     max_gap_ratio=0.5,
                     min_len=15)
    datasaver.save(data)

    #maf_to_bed
    header_print("Convert per-genome CNS regions to .bed:")
    data['genome_cns_beds_folder'] = create_path(output_folder +
                                                 "genome_cns_beds",
                                                 overwrite=overwrite)
    # chrom_name defaults to None; concatenating None into the file name used
    # to raise TypeError, so the suffix is simply left off in that case.
    if chrom_name is None:
        cns_suffix = "_cns"
    else:
        cns_suffix = "_cns_" + chrom_name
    cns_maf = Maf(file_name=data['cns_maf'])
    for genome in data['genomes']:
        data['genomes'][genome]['cns_bed'] = create_path(
            data['genome_cns_beds_folder'],
            genome + cns_suffix,
            "bed",
            overwrite=overwrite)
        bed = cns_maf.to_bed(genome_name=genome, index_tag="cns_maf_index")
        bed.save_file(data['genomes'][genome]['cns_bed'])
    # Drop the parsed alignment before the external bedtools runs below.
    del cns_maf
    datasaver.save(data)

    #$bedtools closest
    header_print("Find closest gene for each CNS region:")
    data['gene_proximity_beds_folder'] = create_path(output_folder +
                                                     "gene_proximity_beds",
                                                     overwrite=overwrite)
    for genome in data['genomes']:
        genome_info = data['genomes'][genome]
        genome_info['gene_proximity_bed'] = create_path(
            data['gene_proximity_beds_folder'],
            genome + "_proxim",
            "bed",
            overwrite=overwrite)
        closest_args = [
            "closest", "-D", "a", "-a", genome_info['cns_bed'], "-b",
            genome_info['annot_bed']
        ]
        _run_bedtools(closest_args, genome_info['gene_proximity_bed'])
    datasaver.save(data)

    #maf_and_proxim_bed_to_cns
    header_print("Process proximity and maf files into .cns file:")
    data['results'] = create_path(output_folder,
                                  "identified_CNSs",
                                  "cns",
                                  overwrite=overwrite)
    cns_proxim_beds = {
        genome: Bed13(data['genomes'][genome]['gene_proximity_bed'])
        for genome in data['genomes']
    }
    Maf(file_name=data['cns_maf'])\
        .cns_from_proxim_beds(cns_proxim_beds,"cns_maf_index")\
        .save_file(data['results'])
    datasaver.save(data)

    return data