Exemplo n.º 1
0
def merge_srna_table(srna_file, csvs, wigs_f, wigs_r,
                     tss_file, args_srna):
    '''Write the merged sRNA table to csvs["merge"].

    Each sRNA read from srna_file is handed to compare_table together
    with one of two tables read from csvs:
    - the "utr" table when its sRNA_type attribute contains "5utr",
      "3utr" or "interCDS";
    - the "normal" table (tagged "inter") when it contains "intergenic",
      "in_CDS" or "antisense".
    sRNAs whose type matches neither group are skipped.

    srna_file: sRNA gff file.
    csvs: dict providing the paths under "normal", "utr" and "merge".
    wigs_f, wigs_r: passed unchanged to compare_table (presumably the
                    forward / reverse wiggle data -- confirm with caller).
    tss_file: TSS gff file, or None when no TSS data is available.
    args_srna: option container; libs, merge_wigs and ex_srna are read.
    '''
    # Only the second value returned by read_libs is used in this function.
    _, texs = read_libs(args_srna.libs, args_srna.merge_wigs)
    srnas = read_gff(srna_file, "sRNA", args_srna.ex_srna)
    tsss = None
    if tss_file is not None:
        tsss = read_gff(tss_file, "tss", args_srna.ex_srna)
    inters = read_table(csvs["normal"], "inter")
    utrs = read_table(csvs["utr"], "utr")
    utr_types = ("5utr", "3utr", "interCDS")
    inter_types = ("intergenic", "in_CDS", "antisense")
    # The context manager closes the table even if compare_table raises.
    with open(csvs["merge"], "w") as out:
        for srna in srnas:
            srna_type = srna.attributes["sRNA_type"]
            # Containment (not equality) is used on purpose, as before.
            if any(tag in srna_type for tag in utr_types):
                compare_table(srna, utrs, "utr", wigs_f, wigs_r,
                              texs, out, tsss, args_srna)
            elif any(tag in srna_type for tag in inter_types):
                compare_table(srna, inters, "inter", wigs_f, wigs_r,
                              texs, out, tsss, args_srna)
    paras = [wigs_r, wigs_f, srnas, tsss, inters, utrs]
    free_memory(paras)
Exemplo n.º 2
0
def gen_table_transcript(gff_folder, args_tran):
    '''generate the detail table of transcript

    For every regular file in gff_folder:
    - the forward / reverse wiggle files named after the gff file
      ("_transcript.gff" replaced by "_forward.wig" / "_reverse.wig")
      are read from args_tran.wig_path;
    - all gff entries are parsed and handed to print_coverage, which
      receives a table handle (<out_folder>/tables/<name>.csv) and a
      temporary gff handle (<out_folder>/tmp_gff);
    - the temporary gff then replaces the original gff file.
    Finally <out_folder>/merge_wigs is removed when it exists.

    gff_folder: folder holding the transcript gff files.
    args_tran: option container; libs, merge_wigs, wig_path, out_folder
               and table_best are read.
    '''
    # Only the first value returned by read_libs is used in this function.
    libs, _ = read_libs(args_tran.libs, args_tran.merge_wigs)
    tmp_gff = os.path.join(args_tran.out_folder, "tmp_gff")
    for gff in os.listdir(gff_folder):
        gff_path = os.path.join(gff_folder, gff)
        if not os.path.isfile(gff_path):
            continue
        prefix = gff.replace("_transcript.gff", "")
        wigs_f = read_wig(
            os.path.join(args_tran.wig_path,
                         "_".join([prefix, "forward.wig"])), "+", libs)
        wigs_r = read_wig(
            os.path.join(args_tran.wig_path,
                         "_".join([prefix, "reverse.wig"])), "-", libs)
        # Read all entries up front so the input handle is closed before
        # the file is replaced below.
        with open(gff_path, "r") as th:
            trans = list(Gff3Parser().entries(th))
        table_path = os.path.join(args_tran.out_folder, "tables",
                                  gff.replace(".gff", ".csv"))
        with open(table_path, "w") as out, open(tmp_gff, "w") as out_gff:
            out_gff.write("##gff-version 3\n")
            out.write("\t".join(["Genome", "Name", "Start", "End", "Strand",
                                 "Detect_lib_type", "Associated_gene",
                                 "Associated_tss", "Associated_term",
                                 "Coverage_details"]) + "\n")
            print_coverage(trans, out, out_gff, wigs_f, wigs_r,
                           args_tran.table_best)
        # Both handles are closed here, so the move sees complete data.
        shutil.move(tmp_gff, gff_path)
    merge_wigs = os.path.join(args_tran.out_folder, "merge_wigs")
    if os.path.exists(merge_wigs):
        shutil.rmtree(merge_wigs)
Exemplo n.º 3
0
def gen_table_transcript(gff_folder, args_tran):
    '''Generate the detail table for every transcript gff file.

    For every regular file in gff_folder the matching forward / reverse
    wiggle files ("_transcript.gff" replaced by "_forward.wig" /
    "_reverse.wig") are read from args_tran.wig_path, the gff entries
    are parsed, and print_coverage is called with a table handle
    (<out_folder>/tables/<name>.csv) and a temporary gff handle
    (<out_folder>/tmp_gff).  The temporary gff then replaces the
    original gff file.  At the end <out_folder>/merge_wigs is removed
    when it exists.

    gff_folder: folder holding the transcript gff files.
    args_tran: option container; libs, merge_wigs, wig_path, out_folder
               and table_best are read.
    '''
    # Only the first value returned by read_libs is used in this function.
    libs, _ = read_libs(args_tran.libs, args_tran.merge_wigs)
    tmp_gff = os.path.join(args_tran.out_folder, "tmp_gff")
    for gff in os.listdir(gff_folder):
        gff_path = os.path.join(gff_folder, gff)
        if not os.path.isfile(gff_path):
            continue
        prefix = gff.replace("_transcript.gff", "")
        wigs_f = read_wig(
            os.path.join(args_tran.wig_path,
                         "_".join([prefix, "forward.wig"])), "+", libs)
        wigs_r = read_wig(
            os.path.join(args_tran.wig_path,
                         "_".join([prefix, "reverse.wig"])), "-", libs)
        # Parse first and close the input handle; it used to stay open
        # while the file was being replaced.
        with open(gff_path, "r") as th:
            trans = list(Gff3Parser().entries(th))
        table_path = os.path.join(args_tran.out_folder, "tables",
                                  gff.replace(".gff", ".csv"))
        with open(table_path, "w") as out, open(tmp_gff, "w") as out_gff:
            out_gff.write("##gff-version 3\n")
            out.write("\t".join(["strain", "Name", "start", "end", "strand",
                                 "detect_lib_type", "associated_gene",
                                 "associated_tss", "associated_term",
                                 "coverage_details"]) + "\n")
            print_coverage(trans, out, out_gff, wigs_f, wigs_r,
                           args_tran.table_best)
        # Handles are closed before the temporary gff replaces the input.
        shutil.move(tmp_gff, gff_path)
    merge_wigs = os.path.join(args_tran.out_folder, "merge_wigs")
    if os.path.exists(merge_wigs):
        shutil.rmtree(merge_wigs)
Exemplo n.º 4
0
def filter_low_expression(gff_file, args_tss, wig_f_file, wig_r_file,
                          out_file):
    '''filter the low expressed TSS

    Starting at a cutoff of 1, stat() is evaluated against the manual
    reference set and the cutoff is raised in steps of 0.1 for as long
    as change_best() reports a change.  The entries are then written
    with print_file() using the cutoff at which the search stopped,
    and that cutoff is returned.
    '''
    candidates = read_gff(gff_file)
    references = read_gff(args_tss.manual_file)
    # The call is kept for parity with the original; its results are
    # not used below.
    # NOTE(review): read_wig receives args_tss.libs, not the libs
    # returned here -- confirm this is intended.
    _libs, _texs = read_libs(args_tss.input_lib, args_tss.wig_folder)
    forward_wigs = read_wig(wig_f_file, "+", args_tss.libs)
    reverse_wigs = read_wig(wig_r_file, "-", args_tss.libs)
    compare_wig(candidates, forward_wigs, reverse_wigs)
    cutoff = 1
    # Initial evaluation: only seeds the best statistics.
    stat_value, num_ref = stat(candidates, references, cutoff,
                               args_tss.gene_length, args_tss.cluster)
    best = stat_value.copy()
    # The first pass of this loop re-evaluates the initial cutoff, as the
    # original flag-driven loop did.
    while True:
        stat_value, num_ref = stat(candidates, references, cutoff,
                                   args_tss.gene_length, args_tss.cluster)
        best, change = change_best(num_ref, best, stat_value)
        if not change:
            break
        cutoff = cutoff + 0.1
    print_file(candidates, cutoff, out_file)
    return cutoff
Exemplo n.º 5
0
def merge_srna_table(srna_file, csvs, wig_f_file, wig_r_file,
                     tss_file, args_srna):
    '''Write the merged sRNA table to csvs["merge"].

    Each sRNA read from srna_file is handed to compare_table together
    with one of two tables read from csvs:
    - the "utr" table when its sRNA_type attribute equals "5utr",
      "3utr" or "interCDS";
    - the "normal" table (tagged "inter") when it equals "intergenic",
      "in_CDS" or "antisense".
    sRNAs of any other type are skipped.

    srna_file: sRNA gff file.
    csvs: dict providing the paths under "normal", "utr" and "merge".
    wig_f_file, wig_r_file: wiggle files read with strand "+" and "-".
    tss_file: TSS gff file, or None when no TSS data is available.
    args_srna: option container; libs and merge_wigs are read.
    '''
    libs, texs = read_libs(args_srna.libs, args_srna.merge_wigs)
    wigs_f = read_wig(wig_f_file, "+", libs)
    wigs_r = read_wig(wig_r_file, "-", libs)
    srnas = read_gff(srna_file, "sRNA")
    tsss = None
    if tss_file is not None:
        tsss = read_gff(tss_file, "tss")
    inters = read_table(csvs["normal"], "inter")
    utrs = read_table(csvs["utr"], "utr")
    utr_types = ("5utr", "3utr", "interCDS")
    inter_types = ("intergenic", "in_CDS", "antisense")
    # The context manager closes the table even if compare_table raises.
    with open(csvs["merge"], "w") as out:
        for srna in srnas:
            srna_type = srna.attributes["sRNA_type"]
            if srna_type in utr_types:
                compare_table(srna, utrs, "utr", wigs_f, wigs_r,
                              texs, out, tsss, args_srna)
            elif srna_type in inter_types:
                compare_table(srna, inters, "inter", wigs_f, wigs_r,
                              texs, out, tsss, args_srna)
    paras = [wigs_r, wigs_f, srnas, tsss, inters, utrs]
    free_memory(paras)
Exemplo n.º 6
0
def utr_derived_srna(args_srna):
    '''Detect sRNA candidates in the regions built from the CDS list.

    Regions are collected by get_terminal / get_inter, sorted by
    strain, start, end and strand, and every region is classified
    against each transcript on the same strain and strand.  Coverage
    cutoffs are then derived with set_cutoff, printed with
    print_median, and detect_srna is run.

    args_srna: option container; it is replaced by the container that
               extend_utr_container returns, which also carries the
               output handles closed at the end.
    '''
    cdss, tas, tsss, pros, seq = read_data(args_srna)
    libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
    wig_fs = read_wig(args_srna.wig_f_file, "+", libs)
    wig_rs = read_wig(args_srna.wig_r_file, "-", libs)
    out = open(args_srna.output_file, "w")
    out.write("##gff-version 3\n")
    out_t = open(args_srna.output_table, "w")

    # get_terminal / get_inter fill the list in place.
    inters = []
    get_terminal(cdss, inters, seq, "start")
    get_inter(cdss, inters)
    get_terminal(cdss, inters, seq, "end")

    def region_order(region):
        return (region["strain"], region["start"],
                region["end"], region["strand"])

    inters = sorted(inters, key=region_order)
    args_srna = ArgsContainer().extend_utr_container(
        args_srna, cdss, tsss, pros, wig_fs, wig_rs, out, out_t, texs)
    for inter in inters:
        for ta in tas:
            same_strain = inter["strain"] == ta.seq_id
            if same_strain and (inter["strand"] == ta.strand):
                class_utr(inter, ta, args_srna)
    covers = get_utr_coverage(args_srna.utrs)
    mediandict = set_cutoff(covers, args_srna)
    print_median(args_srna.out_folder, mediandict)
    detect_srna(mediandict, args_srna)
    # NOTE(review): the handles are closed through the container;
    # presumably these are the `out` / `out_t` opened above -- confirm.
    args_srna.out.close()
    args_srna.out_t.close()
    # `tas` appears twice, exactly as in the original list.
    paras = [wig_fs, wig_rs, args_srna.srnas, args_srna.utrs,
             args_srna.wig_fs, args_srna.wig_rs, seq, inters,
             tas, cdss, tas, tsss, pros, covers]
    free_memory(paras)
Exemplo n.º 7
0
def filter_low_expression(gff_file, args_tss, wig_f_file,
                          wig_r_file, out_file):
    '''filter the low expressed TSS

    The cutoff starts at 1 and grows by 0.1 while change_best() keeps
    reporting a change between the best statistics so far and the
    statistics returned by stat().  print_file() is then called with
    the final cutoff, which is also the return value.
    '''
    tss_entries = read_gff(gff_file)
    manual_entries = read_gff(args_tss.manual_file)
    # Results are unused; the call itself is preserved from the original.
    # NOTE(review): args_tss.libs (not the value returned here) is what
    # read_wig receives -- confirm this is intended.
    _libs, _texs = read_libs(args_tss.input_lib, args_tss.wig_folder)
    plus_wigs = read_wig(wig_f_file, "+", args_tss.libs)
    minus_wigs = read_wig(wig_r_file, "-", args_tss.libs)
    compare_wig(tss_entries, plus_wigs, minus_wigs)

    def evaluate(threshold):
        return stat(tss_entries, manual_entries, threshold,
                    args_tss.gene_length, args_tss.cluster)

    cutoff = 1
    # Seed the best statistics with the starting cutoff.
    stat_value, num_ref = evaluate(cutoff)
    best = stat_value.copy()
    # As in the original loop, the starting cutoff is evaluated a second
    # time before the first increment.
    while True:
        stat_value, num_ref = evaluate(cutoff)
        best, change = change_best(num_ref, best, stat_value)
        if not change:
            break
        cutoff = cutoff + 0.1
    print_file(tss_entries, cutoff, out_file)
    return cutoff
Exemplo n.º 8
0
def utr_derived_srna(args_srna):
    '''Run the UTR-derived sRNA detection.

    Steps, in order: read the input data and wiggle files, open the gff
    and table outputs, build the region list from the CDSs, sort it by
    (strain, start, end, strand), classify every region against each
    transcript with the same strain and strand, compute and print the
    median-based cutoffs, and call detect_srna.

    args_srna: option container; rebound to the result of
               extend_utr_container, through which the output handles
               are closed at the end.
    '''
    cdss, tas, tsss, pros, seq = read_data(args_srna)
    libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
    wig_fs = read_wig(args_srna.wig_f_file, "+", libs)
    wig_rs = read_wig(args_srna.wig_r_file, "-", libs)
    out = open(args_srna.output_file, "w")
    out.write("##gff-version 3\n")
    out_t = open(args_srna.output_table, "w")

    # The helpers below append to this list in place.
    inters = []
    get_terminal(cdss, inters, seq, "start")
    get_inter(cdss, inters)
    get_terminal(cdss, inters, seq, "end")

    def sort_key(item):
        return (item["strain"], item["start"], item["end"], item["strand"])

    inters = sorted(inters, key=sort_key)
    container = ArgsContainer()
    args_srna = container.extend_utr_container(
        args_srna, cdss, tsss, pros, wig_fs, wig_rs, out, out_t, texs)
    for inter in inters:
        for ta in tas:
            strain_match = inter["strain"] == ta.seq_id
            if strain_match and (inter["strand"] == ta.strand):
                class_utr(inter, ta, args_srna)
    covers = get_utr_coverage(args_srna.utrs)
    mediandict = set_cutoff(covers, args_srna)
    print_median(args_srna.out_folder, mediandict)
    detect_srna(mediandict, args_srna)
    # NOTE(review): presumably args_srna.out / out_t are the handles
    # opened above -- confirm against extend_utr_container.
    args_srna.out.close()
    args_srna.out_t.close()
    # `tas` is listed twice, matching the original.
    paras = [wig_fs, wig_rs, args_srna.srnas, args_srna.utrs,
             args_srna.wig_fs, args_srna.wig_rs, seq, inters,
             tas, cdss, tas, tsss, pros, covers]
    free_memory(paras)
Exemplo n.º 9
0
def upstream(tss_file, fasta_file, gff_file, out_class, args_pro, prefix):
    '''get the upstream sequence of TSS

    When args_pro.source is true, every TSS must already carry a "type"
    attribute and its sequence is written with print_fasta.  Otherwise
    each TSS is classified with compare_tss_cds, the primary types are
    revised by fix_primary_type, the result is written as gff to
    out_class and, if fasta_file is given, print_fasta is called too.

    tss_file, fasta_file: inputs of read_data; the five FASTA files
                          under tmp/ are opened only when fasta_file is
                          not None.
    gff_file: annotation read with read_gff when TSSs are classified.
    out_class: gff path written when args_pro.source is false.
    args_pro: option container; source, nt_before, tex_wigs, input_libs
              and wig_path are read.
    prefix: name prefix of the "_forward.wig" / "_reverse.wig" files.

    The process exits through sys.exit() when args_pro.source is true
    and a TSS has no "type" attribute.
    '''
    # Local import: keeps the block independent of the module's imports.
    import contextlib

    # Every handle entered on the stack is closed on return, on error
    # and on sys.exit(); the handles used to be left open.
    with contextlib.ExitStack() as stack:
        if fasta_file is not None:
            fasta_paths = (("pri", "tmp/primary.fa"),
                           ("sec", "tmp/secondary.fa"),
                           ("inter", "tmp/internal.fa"),
                           ("anti", "tmp/antisense.fa"),
                           ("orph", "tmp/orphan.fa"))
            files = {key: stack.enter_context(open(path, "w"))
                     for key, path in fasta_paths}
        # NOTE(review): `files` stays unbound when fasta_file is None,
        # yet the args_pro.source branch below uses it unconditionally.
        tsss, seq = read_data(tss_file, fasta_file)
        num_tss = 0
        if not args_pro.source:
            out = stack.enter_context(open(out_class, "w"))
            out.write("##gff-version 3\n")
            cdss, genes = read_gff(gff_file)
        for tss in tsss:
            if ("type" not in tss.attributes) and (args_pro.source):
                print("Error: The TSS gff file may not generated from ANNOgesic."
                      "Please run with --tss_source!")
                sys.exit()
            if args_pro.source:
                name = ">" + "_".join([str(tss.start), tss.strand,
                                       tss.seq_id])
                print_fasta(seq, tss, files, name, args_pro.nt_before)
            else:
                tss_type = compare_tss_cds(tss, cdss, genes)
                tss.attributes = tss_type[1]
                tss.attributes["ID"] = tss.seq_id + "_tss" + str(num_tss)
                tss.attribute_string = "".join(
                    [tss_type[0], ";ID=", tss.seq_id, "_tss",
                     str(num_tss)])
                num_tss += 1
        if not args_pro.source:
            if args_pro.tex_wigs is not None:
                libs, texs = read_libs(args_pro.input_libs,
                                       args_pro.tex_wigs)
                wigs_f = read_wig(
                    os.path.join(args_pro.wig_path,
                                 prefix + "_forward.wig"), "+", libs)
                # NOTE(review): the reverse wiggle is read with strand
                # "+", not "-"; left unchanged -- confirm it is intended.
                wigs_r = read_wig(
                    os.path.join(args_pro.wig_path,
                                 prefix + "_reverse.wig"), "+", libs)
            else:
                wigs_f = None
                wigs_r = None
            sort_tsss = sorted(
                tsss, key=lambda k: (k.seq_id, k.start, k.end, k.strand))
            final_tsss = fix_primary_type(sort_tsss, wigs_f, wigs_r)
            for tss in final_tsss:
                name = ">" + "_".join([str(tss.start), tss.strand,
                                       tss.seq_id])
                # Rebuild the attribute column from the attribute dict.
                tss.attribute_string = ";".join(
                    ["=".join(items) for items in tss.attributes.items()])
                out.write("\t".join([
                    str(field) for field in [
                        tss.seq_id, tss.source, tss.feature, tss.start,
                        tss.end, tss.score, tss.strand, tss.phase,
                        tss.attribute_string
                    ]
                ]) + "\n")
                if fasta_file is not None:
                    print_fasta(seq, tss, files, name, args_pro.nt_before)
Exemplo n.º 10
0
def gene_expression(input_libs, gff_folder, percent_tex, percent_frag,
                    wig_f_file, wig_r_file, features, wigs, cutoff_coverage,
                    tex_notex, replicates, stat_folder, out_gff_folder,
                    cover_type, max_color, min_color):
    '''Compare the coverage of gff features against the wiggle data.

    For every "*.gff" file in gff_folder the entries returned by
    read_data are grouped per feature; each entry is counted in the
    statistics and compared with the forward ("+") or reverse ("-")
    wiggle data via compare_wigs.  Statistics and gff output are
    written per file with output_stat / output_gff, and plot() is
    called once at the end.
    '''
    print("Loading wiggle file...")
    libs, texs = read_libs(input_libs, wigs)
    wig_fs = read_wig(wig_f_file, "+", libs)
    wig_rs = read_wig(wig_r_file, "-", libs)
    wigs_by_strand = {"+": wig_fs, "-": wig_rs}
    counter_keys = ("total", "least_one", "all", "none")
    plots = {}
    repeat = {}
    for gff_name in os.listdir(gff_folder):
        if not gff_name.endswith(".gff"):
            continue
        prefix = gff_name.replace(".gff", "")
        print("Computing " + prefix)
        gff_list, stats, outs = read_data(
            os.path.join(gff_folder, gff_name), features)
        for feature, entries in gff_list.items():
            # Per-feature state is reset for every gff file.
            plots[feature] = []
            repeat[feature] = {}
            tags = []
            feature_stats = stats[feature]
            feature_stats["total"] = dict.fromkeys(counter_keys, 0)
            for num, entry in enumerate(entries):
                if entry.seq_id not in feature_stats.keys():
                    feature_stats[entry.seq_id] = dict.fromkeys(
                        counter_keys, 0)
                feature_stats["total"]["total"] += 1
                feature_stats[entry.seq_id]["total"] += 1
                name = get_name(plots, entry, feature, repeat[feature], tags)
                # Entries with any other strand value are not compared.
                if entry.strand in wigs_by_strand:
                    compare_wigs(wigs_by_strand[entry.strand], entry,
                                 tex_notex, texs, replicates,
                                 feature_stats, outs[feature],
                                 plots[feature][num][name], cover_type,
                                 cutoff_coverage, percent_tex,
                                 percent_frag)
        output_stat(stats, stat_folder, prefix)
        output_gff(outs, out_gff_folder, prefix)
    plot(plots, stat_folder, max_color, min_color, cover_type)
Exemplo n.º 11
0
def upstream(tss_file, fasta_file, gff_file, out_class, args_pro, prefix):
    '''get the upstream sequence of TSS

    With args_pro.source set, each TSS must already have a "type"
    attribute and is passed straight to print_fasta.  Without it, each
    TSS is classified by compare_tss_cds, fix_primary_type revises the
    sorted list, the result is written as gff to out_class, and
    print_fasta is called when fasta_file is given.

    tss_file, fasta_file: inputs of read_data; the FASTA outputs under
                          tmp/ are opened only if fasta_file is not None.
    gff_file: annotation read with read_gff when TSSs are classified.
    out_class: gff path written when args_pro.source is false.
    args_pro: option container; source, nt_before, tex_wigs, input_libs
              and wig_path are read.
    prefix: name prefix of the "_forward.wig" / "_reverse.wig" files.

    Calls sys.exit() when args_pro.source is set and a TSS lacks the
    "type" attribute.
    '''
    # Local import: keeps the block independent of the module's imports.
    import contextlib

    # The stack closes every output handle on return, error or
    # sys.exit(); the original never closed them.
    with contextlib.ExitStack() as stack:
        if fasta_file is not None:
            files = {}
            for key, path in (("pri", "tmp/primary.fa"),
                              ("sec", "tmp/secondary.fa"),
                              ("inter", "tmp/internal.fa"),
                              ("anti", "tmp/antisense.fa"),
                              ("orph", "tmp/orphan.fa")):
                files[key] = stack.enter_context(open(path, "w"))
        # NOTE(review): when fasta_file is None `files` is unbound, but
        # the args_pro.source branch below still passes it on.
        tsss, seq = read_data(tss_file, fasta_file)
        num_tss = 0
        if not args_pro.source:
            out = stack.enter_context(open(out_class, "w"))
            out.write("##gff-version 3\n")
            cdss, genes = read_gff(gff_file)
        for tss in tsss:
            if ("type" not in tss.attributes) and (args_pro.source):
                print("Error: The TSS gff file may not generated from ANNOgesic."
                      "Please run with --tss_source!")
                sys.exit()
            if args_pro.source:
                name = ">" + "_".join([str(tss.start), tss.strand,
                                       tss.seq_id])
                print_fasta(seq, tss, files, name, args_pro.nt_before)
            else:
                tss_type = compare_tss_cds(tss, cdss, genes)
                tss.attributes = tss_type[1]
                tss.attributes["ID"] = tss.seq_id + "_tss" + str(num_tss)
                tss.attribute_string = "".join([
                    tss_type[0], ";ID=", tss.seq_id, "_tss", str(num_tss)])
                num_tss += 1
        if not args_pro.source:
            if args_pro.tex_wigs is not None:
                libs, texs = read_libs(args_pro.input_libs,
                                       args_pro.tex_wigs)
                wigs_f = read_wig(os.path.join(
                    args_pro.wig_path, prefix + "_forward.wig"), "+", libs)
                # NOTE(review): strand "+" is used for the reverse
                # wiggle as well; kept as is -- confirm it is intended.
                wigs_r = read_wig(os.path.join(
                    args_pro.wig_path, prefix + "_reverse.wig"), "+", libs)
            else:
                wigs_f = None
                wigs_r = None
            sort_tsss = sorted(tsss, key=lambda k: (k.seq_id, k.start,
                                                    k.end, k.strand))
            final_tsss = fix_primary_type(sort_tsss, wigs_f, wigs_r)
            for tss in final_tsss:
                name = ">" + "_".join([str(tss.start), tss.strand,
                                       tss.seq_id])
                # Rebuild the attribute column from the attribute dict.
                tss.attribute_string = ";".join(
                    ["=".join(items) for items in tss.attributes.items()])
                out.write("\t".join([str(field) for field in [
                    tss.seq_id, tss.source, tss.feature, tss.start,
                    tss.end, tss.score, tss.strand, tss.phase,
                    tss.attribute_string]]) + "\n")
                if fasta_file is not None:
                    print_fasta(seq, tss, files, name, args_pro.nt_before)
Exemplo n.º 12
0
def detect_coverage(term_table, gff_file, tran_file, seq_file, wig_f_file,
                    wig_r_file, tranterm_file, wig_folder, output_file,
                    output_table, args_term):
    '''Combine the terminator candidates, add coverage and print them.

    The candidates returned by compare_transtermhp are compared with the
    transcripts (compare_ta), compute_wig is run for the forward ("+")
    and reverse ("-") wiggle files, and print_term writes the result to
    output_file and output_table.

    args_term: option container; fuzzy and libs are read here and the
               container is passed on to compute_wig and print_term.
    '''
    gffs, tas, hps, fr_terms, seq = read_data(gff_file, tran_file,
                                              tranterm_file, seq_file,
                                              term_table)
    terms = compare_transtermhp(hps, fr_terms)
    compare_ta(terms, tas, args_term.fuzzy)
    libs, texs = read_libs(args_term.libs, wig_folder)
    compute_wig(wig_f_file, libs, terms, "+", texs, args_term)
    compute_wig(wig_r_file, libs, terms, "-", texs, args_term)
    # The handles were never closed before; the context manager closes
    # (and flushes) both once print_term has finished.
    with open(output_file, "w") as out, open(output_table, "w") as out_t:
        print_term(terms, out, out_t, args_term)
Exemplo n.º 13
0
def detect_coverage(term_table, gff_file, tran_file, seq_file,
                    wig_f_file, wig_r_file, tranterm_file, wig_folder,
                    output_file, output_table, args_term):
    '''Merge terminator candidates, compute coverage and write them out.

    Order of work: read the inputs, merge the two candidate sets with
    compare_transtermhp, compare them with the transcripts, run
    compute_wig on the forward ("+") and reverse ("-") wiggle files,
    then let print_term write output_file and output_table.

    args_term: option container; fuzzy and libs are read here and the
               container is passed on to compute_wig and print_term.
    '''
    gffs, tas, hps, fr_terms, seq = read_data(gff_file, tran_file,
                                              tranterm_file, seq_file,
                                              term_table)
    terms = compare_transtermhp(hps, fr_terms)
    compare_ta(terms, tas, args_term.fuzzy)
    libs, texs = read_libs(args_term.libs, wig_folder)
    for wig_file, strand in ((wig_f_file, "+"), (wig_r_file, "-")):
        compute_wig(wig_file, libs, terms, strand, texs, args_term)
    # Both outputs are now closed deterministically; they used to be
    # left open when the function returned.
    with open(output_file, "w") as out, open(output_table, "w") as out_t:
        print_term(terms, out, out_t, args_term)
Exemplo n.º 14
0
def assembly(wig_f_file, wig_r_file, wig_folder, input_lib,
             out_file, wig_type, args_tran):
    '''Build transcripts from the wiggle files and write them as gff.

    The forward ("+") and reverse ("-") wiggle data are converted with
    transfer_to_tran, and fill_gap_and_print writes both strands to
    out_file, which starts with the gff version 3 header line.

    wig_type and args_tran are passed on to the helpers unchanged.
    '''
    # The context manager closes the gff file even if a helper raises.
    with open(out_file, "w") as out:
        out.write("##gff-version 3\n")
        libs, texs = read_libs(input_lib, wig_folder)
        wig_fs = read_wig(wig_f_file, "+", libs)
        wig_rs = read_wig(wig_r_file, "-", libs)
        tolers_f, tran_fs = transfer_to_tran(wig_fs, libs, texs, "+",
                                             args_tran)
        tolers_r, tran_rs = transfer_to_tran(wig_rs, libs, texs, "-",
                                             args_tran)
        fill_gap_and_print(tran_fs, "+", out, tolers_f, wig_type, args_tran)
        fill_gap_and_print(tran_rs, "-", out, tolers_r, wig_type, args_tran)
    # The trailing `del wig_fs` / `del wig_rs` were dropped: the locals
    # go out of scope on return anyway.
Exemplo n.º 15
0
def sorf_detection(fasta, srna_gff, inter_gff, tss_file, wig_f_file,
                   wig_r_file, out_prefix, args_sorf):
    '''Detect sORF candidates and write the "all" and "best" results.

    Four files are written: <out_prefix>_all.gff, <out_prefix>_all.csv,
    <out_prefix>_best.gff and <out_prefix>_best.csv.  Candidates from
    detect_start_stop are compared with TSSs and sRNAs, sorted by
    strain, start, end and strand, passed through coverage_and_output
    in "first" mode, merged, and passed through coverage_and_output
    again in "final" mode (the "best" set is filtered by get_best
    first).

    args_sorf: option container; libs, merge_wigs, utr_detect and
               background are read here and the container is passed on.
    '''
    coverages = set_coverage(args_sorf)
    libs, texs = read_libs(args_sorf.libs, args_sorf.merge_wigs)
    inters, tsss, srnas, seq = read_data(inter_gff, tss_file, srna_gff, fasta,
                                         args_sorf.utr_detect)
    wigs = {
        "forward": read_wig(wig_f_file, "+", libs),
        "reverse": read_wig(wig_r_file, "-", libs)
    }
    med_inters = detect_inter_type(inters, wigs, args_sorf.background)
    inter_covers = {}
    mediandict = {}
    for strain, meds in med_inters.items():
        inter_covers[strain] = {"5utr": {}, "3utr": {}, "interCDS": {}}
        for type_, covers in meds.items():
            get_inter_coverage(covers, inter_covers[strain][type_])
    set_median(inter_covers, mediandict, coverages)

    def position(sorf):
        return (sorf["strain"], sorf["start"], sorf["end"], sorf["strand"])

    # All four outputs are closed on exit, including on exceptions.
    with open("_".join([out_prefix, "all.gff"]), "w") as out_ag, \
            open("_".join([out_prefix, "all.csv"]), "w") as out_at, \
            open("_".join([out_prefix, "best.gff"]), "w") as out_bg, \
            open("_".join([out_prefix, "best.csv"]), "w") as out_bt:
        sorfs = detect_start_stop(inters, seq, args_sorf)
        sorfs_all, sorfs_best = compare_sorf_tss(sorfs, tsss, tss_file,
                                                 args_sorf)
        compare_sorf_srna(sorfs_all, srnas, srna_gff)
        compare_sorf_srna(sorfs_best, srnas, srna_gff)
        sorfs_all = sorted(sorfs_all, key=position)
        sorfs_best = sorted(sorfs_best, key=position)
        final_all = coverage_and_output(
            sorfs_all, mediandict, wigs, out_ag, out_at, "all", seq,
            coverages, args_sorf, texs, "first")
        final_best = coverage_and_output(
            sorfs_best, mediandict, wigs, out_bg, out_bt, "best", seq,
            coverages, args_sorf, texs, "first")
        final_all = merge(final_all, seq)
        final_best = merge(final_best, seq)
        final_best = get_best(final_best, tss_file, srna_gff, args_sorf)
        coverage_and_output(final_all, mediandict, wigs, out_ag, out_at,
                            "all", seq, coverages, args_sorf, texs, "final")
        coverage_and_output(final_best, mediandict, wigs, out_bg, out_bt,
                            "best", seq, coverages, args_sorf, texs, "final")
Exemplo n.º 16
0
def detect_transcript(wig_f_file, wig_r_file, wig_folder, input_lib,
                      out_file, wig_type, args_tran):
    '''Detect transcripts from the wiggle files and write them as gff.

    The forward ("+") and reverse ("-") wiggle data are converted with
    transfer_to_tran, fill_gap_and_print collects both strands in one
    dict, and print_transcript writes that dict to out_file after the
    gff version 3 header line.

    wig_type and args_tran are passed on to the helpers unchanged.
    '''
    # The context manager closes the gff file even if a helper raises.
    with open(out_file, "w") as out:
        out.write("##gff-version 3\n")
        finals = {}
        libs, texs = read_libs(input_lib, wig_folder)
        wig_fs = read_wig(wig_f_file, "+", libs)
        wig_rs = read_wig(wig_r_file, "-", libs)
        tolers_f, tran_fs = transfer_to_tran(wig_fs, libs, texs, "+",
                                             args_tran)
        tolers_r, tran_rs = transfer_to_tran(wig_rs, libs, texs, "-",
                                             args_tran)
        fill_gap_and_print(tran_fs, "+", finals, tolers_f, wig_type,
                           args_tran)
        fill_gap_and_print(tran_rs, "-", finals, tolers_r, wig_type,
                           args_tran)
        print_transcript(finals, out)
    # The trailing `del wig_fs` / `del wig_rs` were dropped: the locals
    # go out of scope on return anyway.
Exemplo n.º 17
0
def gene_expression(input_libs, gff_folder, percent_tex, percent_frag,
                    wig_f_file, wig_r_file, features, wigs, cutoff_coverage,
                    tex_notex, replicates, stat_folder, out_gff_folder,
                    cover_type, max_color, min_color):
    '''Check the expression of gff features with the wiggle data.

    Each "*.gff" file of gff_folder is read with read_data; per feature
    the entries are counted, named with get_name and compared with the
    wiggle data of their strand through compare_wigs.  output_stat and
    output_gff are called once per gff file, plot() once at the end.
    '''
    print("Loading wiggle file...")
    libs, texs = read_libs(input_libs, wigs)
    wig_fs = read_wig(wig_f_file, "+", libs)
    wig_rs = read_wig(wig_r_file, "-", libs)

    def new_counter():
        return {"total": 0, "least_one": 0, "all": 0, "none": 0}

    plots = {}
    repeat = {}
    gff_files = [name for name in os.listdir(gff_folder)
                 if name.endswith(".gff")]
    for gff_file in gff_files:
        prefix = gff_file.replace(".gff", "")
        print("Computing " + prefix)
        gff_list, stats, outs = read_data(os.path.join(gff_folder, gff_file),
                                          features)
        for feature, records in gff_list.items():
            # Per-feature containers start empty for every gff file.
            plots[feature] = []
            repeat[feature] = {}
            tags = []
            stats[feature]["total"] = new_counter()
            num = 0
            for record in records:
                if record.seq_id not in stats[feature].keys():
                    stats[feature][record.seq_id] = new_counter()
                stats[feature]["total"]["total"] += 1
                stats[feature][record.seq_id]["total"] += 1
                name = get_name(plots, record, feature, repeat[feature],
                                tags)
                # Pick the wiggle data by strand; other strand values
                # are counted but not compared.
                if record.strand == "+":
                    strand_wigs, compare = wig_fs, True
                elif record.strand == "-":
                    strand_wigs, compare = wig_rs, True
                else:
                    strand_wigs, compare = None, False
                if compare:
                    compare_wigs(
                        strand_wigs, record, tex_notex, texs, replicates,
                        stats[feature], outs[feature],
                        plots[feature][num][name], cover_type,
                        cutoff_coverage, percent_tex, percent_frag)
                num += 1
        output_stat(stats, stat_folder, prefix)
        output_gff(outs, out_gff_folder, prefix)
    plot(plots, stat_folder, max_color, min_color, cover_type)
Exemplo n.º 18
0
def sorf_detection(fasta, srna_gff, inter_gff, tss_file, wig_f_file,
                   wig_r_file, out_prefix, args_sorf):
    '''Run the sORF detection and write the "all" and "best" outputs.

    Outputs: <out_prefix>_all.gff / _all.csv and <out_prefix>_best.gff /
    _best.csv.  The candidates of detect_start_stop are compared with
    TSSs and sRNAs, sorted by (strain, start, end, strand), processed
    by coverage_and_output in "first" mode, merged, and processed again
    in "final" mode; the "best" set additionally goes through get_best.

    args_sorf: option container; libs, merge_wigs, utr_detect and
               background are read here and the container is passed on.
    '''
    coverages = set_coverage(args_sorf)
    libs, texs = read_libs(args_sorf.libs, args_sorf.merge_wigs)
    inters, tsss, srnas, seq = read_data(inter_gff, tss_file, srna_gff,
                                         fasta, args_sorf.utr_detect)
    wigs = {"forward": read_wig(wig_f_file, "+", libs),
            "reverse": read_wig(wig_r_file, "-", libs)}
    med_inters = detect_inter_type(inters, wigs, args_sorf.background)
    inter_covers = {}
    mediandict = {}
    for strain, meds in med_inters.items():
        inter_covers[strain] = {"5utr": {}, "3utr": {}, "interCDS": {}}
        for type_, covers in meds.items():
            get_inter_coverage(covers, inter_covers[strain][type_])
    set_median(inter_covers, mediandict, coverages)
    # Opening the outputs in one `with` guarantees all four are closed,
    # also when one of the steps below raises.
    with open("_".join([out_prefix, "all.gff"]), "w") as out_ag, \
            open("_".join([out_prefix, "all.csv"]), "w") as out_at, \
            open("_".join([out_prefix, "best.gff"]), "w") as out_bg, \
            open("_".join([out_prefix, "best.csv"]), "w") as out_bt:
        sorfs = detect_start_stop(inters, seq, args_sorf)
        sorfs_all, sorfs_best = compare_sorf_tss(sorfs, tsss, tss_file,
                                                 args_sorf)
        compare_sorf_srna(sorfs_all, srnas, srna_gff)
        compare_sorf_srna(sorfs_best, srnas, srna_gff)
        sorfs_all = sorted(
            sorfs_all, key=lambda k: (k["strain"], k["start"],
                                      k["end"], k["strand"]))
        sorfs_best = sorted(
            sorfs_best, key=lambda k: (k["strain"], k["start"],
                                       k["end"], k["strand"]))
        final_all = coverage_and_output(
            sorfs_all, mediandict, wigs, out_ag, out_at,
            "all", seq, coverages, args_sorf, texs, "first")
        final_best = coverage_and_output(
            sorfs_best, mediandict, wigs, out_bg, out_bt,
            "best", seq, coverages, args_sorf, texs, "first")
        final_all = merge(final_all, seq)
        final_best = merge(final_best, seq)
        final_best = get_best(final_best, tss_file, srna_gff, args_sorf)
        coverage_and_output(final_all, mediandict, wigs, out_ag, out_at,
                            "all", seq, coverages, args_sorf, texs, "final")
        coverage_and_output(final_best, mediandict, wigs, out_bg, out_bt,
                            "best", seq, coverages, args_sorf, texs, "final")
Exemplo n.º 19
0
def reorganize_table(input_libs, wigs, cover_header, table_file):
    '''Replace the coverage column of a table by one column per track.

    The tab-separated table_file is rewritten in place (through the
    temporary file table_file + "tmp"):
    - in the header row the column named cover_header is dropped and an
      "Avg_coverage:<track>" column is appended for every track;
    - in every other row the coverage cell is dropped and one value per
      track is appended: the coverage returned by import_covers whose
      name is contained in that track's entry of track_list, or
      "Not_detect".

    input_libs, wigs: arguments of read_libs, whose libraries provide
                      the track names (get_lib_name).

    NOTE(review): when cover_header is missing from the header row,
    `index` is never bound and the first data row raises
    UnboundLocalError, as in the original.
    '''
    # Only the first value returned by read_libs is used in this function.
    libs, _ = read_libs(input_libs, wigs)
    tracks, track_list = get_lib_name(libs)
    tmp_file = table_file + "tmp"
    # Both handles are closed before the move; the input handle used to
    # stay open while its file was replaced.
    with open(table_file, "r") as fh, open(tmp_file, "w") as out:
        for row_num, row in enumerate(csv.reader(fh, delimiter='\t')):
            if row_num == 0:
                headers = []
                for position, header in enumerate(row):
                    if header == cover_header:
                        # Remember the column and leave it out.
                        index = position
                    else:
                        headers.append(header)
                headers.extend("Avg_coverage:" + track for track in tracks)
                out.write("\t".join(headers) + "\n")
                continue
            if len(row) < (index + 1):
                # The row is too short to hold a coverage cell.
                cover_names, covers = [], []
            else:
                cover_names, covers = import_covers(row[index])
            # Slicing drops the coverage cell whether or not it exists.
            row = row[:index] + row[index + 1:]
            detects = ["Not_detect"] * len(tracks)
            for name, cover in zip(cover_names, covers):
                for num_track, track in enumerate(track_list):
                    if name in track:
                        detects[num_track] = cover
            out.write("\t".join(row + detects) + "\n")
    shutil.move(tmp_file, table_file)
Exemplo n.º 20
0
def reorganize_table(input_libs, wigs, cover_header, table_file):
    '''Split the coverage column of a table into one column per track.

    table_file (tab-separated) is rewritten through the temporary file
    table_file + "tmp":
    - header row: the cover_header column is removed and an
      "Avg_coverage:<track>" column is appended per track;
    - data rows: the coverage cell is removed and, for each track, the
      coverage from import_covers whose name is contained in the
      matching track_list entry is appended ("Not_detect" otherwise).

    input_libs, wigs: arguments of read_libs, whose libraries provide
                      the track names (get_lib_name).

    NOTE(review): if cover_header does not occur in the header row,
    `index` stays unbound and the first data row raises
    UnboundLocalError, as in the original.
    '''
    # Only the first value returned by read_libs is used in this function.
    libs, _ = read_libs(input_libs, wigs)
    tracks, track_list = get_lib_name(libs)
    tmp_file = table_file + "tmp"
    # The input handle was never closed before; both files are now
    # closed before the temporary file replaces the table.
    with open(table_file, "r") as fh, open(tmp_file, "w") as out:
        reader = csv.reader(fh, delimiter='\t')
        first = True
        for row in reader:
            if first:
                first = False
                headers = []
                for header_num, header in enumerate(row):
                    if header == cover_header:
                        # Remember the column and leave it out.
                        index = header_num
                    else:
                        headers.append(header)
                for track in tracks:
                    headers.append("Avg_coverage:" + track)
                out.write("\t".join(headers) + "\n")
                continue
            if len(row) < (index + 1):
                # The row is too short to hold a coverage cell.
                cover_names = []
                covers = []
            else:
                cover_names, covers = import_covers(row[index])
            # Slicing drops the coverage cell whether or not it exists.
            row = row[:index] + row[index + 1:]
            detects = ["Not_detect"] * len(tracks)
            for name, cover in zip(cover_names, covers):
                for num_track, track in enumerate(track_list):
                    if name in track:
                        detects[num_track] = cover
            out.write("\t".join(row + detects) + "\n")
    shutil.move(tmp_file, table_file)
Exemplo n.º 21
0
def intergenic_srna(args_srna):
    '''Check every entry loaded by ``read_data`` against the sRNA conditions.

    Reads the libraries, both strand wiggle files, the annotation data and
    the TSS file named in ``args_srna``, then walks over ``tas``. Entries
    flagged as overlapping by ``compare_ta_cds`` are skipped unless
    ``args_srna.in_cds`` is set; every other entry is passed to
    ``check_srna_condition`` together with the cutoff pair selected by the
    ``anti`` flag. Output goes to ``args_srna.output_file`` (starting with
    a ``##gff-version 3`` line) and ``args_srna.output_table``.

    Args:
        args_srna: argument container; it is replaced inside the loop by
            the object returned from ``extend_inter_container``.
    '''
    # NOTE(review): both cutoff pairs come from the same call with the same
    # argument -- confirm the helper is meant to be called twice.
    inter_cutoff_coverage, inter_notex = get_intergenic_antisense_cutoff(
        args_srna)
    anti_cutoff_coverage, anti_notex = get_intergenic_antisense_cutoff(
        args_srna)
    libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
    wigs_f = read_wig(args_srna.wig_f_file, "+", libs)
    wigs_r = read_wig(args_srna.wig_r_file, "-", libs)
    nums, cdss, tas, pros, genes = read_data(args_srna)
    if not args_srna.tss_source:
        compute_tss_type(args_srna, cdss, genes, wigs_f, wigs_r)
    tsss, _num_tss = read_tss(args_srna.tss_file)
    detects = {"overlap": False, "uni_with_tss": False, "anti": False}
    # The context manager closes both outputs even when a helper raises.
    with open(args_srna.output_file, "w") as output, \
            open(args_srna.output_table, "w") as out_table:
        output.write("##gff-version 3\n")
        for ta in tas:
            # Reset the per-entry flags before they are recomputed.
            detects["overlap"] = False
            detects["anti"] = False
            compare_ta_cds(cdss, ta, detects)
            if detects["overlap"] and not args_srna.in_cds:
                continue
            if detects["anti"]:
                cutoff_coverage = anti_cutoff_coverage
                notex = anti_notex
            else:
                cutoff_coverage = inter_cutoff_coverage
                notex = inter_notex
            args_srna = ArgsContainer().extend_inter_container(
                args_srna, tsss, pros, wigs_f, wigs_r, nums, output, out_table,
                texs, detects, cutoff_coverage, notex)
            check_srna_condition(ta, args_srna)
    paras = [
        wigs_f, wigs_r, tsss, tas, pros, genes, cdss, args_srna.wigs_f,
        args_srna.wigs_r
    ]
    free_memory(paras)
Exemplo n.º 22
0
def intergenic_srna(args_srna):
    '''Run the sRNA condition check on each entry returned by ``read_data``.

    Loads libraries, forward/reverse wiggle data, annotation data and TSSs
    as named in ``args_srna``. For each item of ``tas`` the ``overlap`` and
    ``anti`` flags are recomputed by ``compare_ta_cds``; overlapping items
    are skipped unless ``args_srna.in_cds`` is true. The remaining items
    are handed to ``check_srna_condition`` with the cutoff pair chosen by
    the ``anti`` flag. Results are written to ``args_srna.output_file``
    (after a ``##gff-version 3`` header) and ``args_srna.output_table``.

    Args:
        args_srna: argument container; rebound in the loop to the value
            returned by ``extend_inter_container``.
    '''
    # NOTE(review): the two cutoff pairs are produced by identical calls --
    # verify that this is intentional.
    inter_cutoff_coverage, inter_notex = get_intergenic_antisense_cutoff(
                                         args_srna)
    anti_cutoff_coverage, anti_notex = get_intergenic_antisense_cutoff(
                                       args_srna)
    libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
    wigs_f = read_wig(args_srna.wig_f_file, "+", libs)
    wigs_r = read_wig(args_srna.wig_r_file, "-", libs)
    nums, cdss, tas, pros, genes = read_data(args_srna)
    if not args_srna.tss_source:
        compute_tss_type(args_srna, cdss, genes, wigs_f, wigs_r)
    tsss, _num_tss = read_tss(args_srna.tss_file)
    detects = {"overlap": False, "uni_with_tss": False, "anti": False}
    # Opening through ``with`` guarantees the files are closed on errors too.
    with open(args_srna.output_file, "w") as output, \
            open(args_srna.output_table, "w") as out_table:
        output.write("##gff-version 3\n")
        for ta in tas:
            # Flags are per entry, so clear them before each comparison.
            detects["overlap"] = False
            detects["anti"] = False
            compare_ta_cds(cdss, ta, detects)
            if detects["overlap"] and not args_srna.in_cds:
                continue
            if not detects["anti"]:
                cutoff_coverage, notex = inter_cutoff_coverage, inter_notex
            else:
                cutoff_coverage, notex = anti_cutoff_coverage, anti_notex
            args_srna = ArgsContainer().extend_inter_container(
                            args_srna, tsss, pros, wigs_f,
                            wigs_r, nums, output, out_table, texs, detects,
                            cutoff_coverage, notex)
            check_srna_condition(ta, args_srna)
    paras = [wigs_f, wigs_r, tsss, tas, pros, genes, cdss,
             args_srna.wigs_f, args_srna.wigs_r]
    free_memory(paras)
Exemplo n.º 23
0
def gen_table_transcript(gff_folder, args_tran):
    '''generate the detail table of transcript

    For every regular file in ``gff_folder`` the matching forward/reverse
    wiggle files are read from ``args_tran.wig_path``, the GFF entries are
    parsed, and ``print_coverage`` writes a tab-separated table to
    ``<out_folder>/tables/<name>.csv`` plus a GFF to
    ``<out_folder>/tmp_gff``. The temporary GFF is then moved over the
    input file in ``gff_folder``.

    Args:
        gff_folder: directory holding the transcript GFF files.
        args_tran: argument container; ``libs``, ``merge_wigs``,
            ``wig_path``, ``out_folder`` and ``gffs`` are read from it.
    '''
    libs, texs = read_libs(args_tran.libs, args_tran.merge_wigs)
    tmp_gff = os.path.join(args_tran.out_folder, "tmp_gff")
    for gff in os.listdir(gff_folder):
        gff_path = os.path.join(gff_folder, gff)
        if not os.path.isfile(gff_path):
            continue
        prefix = gff.replace("_transcript.gff", "")
        wigs_f = read_wig(os.path.join(args_tran.wig_path,
                                       "_".join([prefix, "forward.wig"])),
                          "+", libs)
        wigs_r = read_wig(os.path.join(args_tran.wig_path,
                                       "_".join([prefix, "reverse.wig"])),
                          "-", libs)
        table_path = os.path.join(args_tran.out_folder, "tables",
                                  gff.replace(".gff", ".csv"))
        # All three handles are released before the move below, so the
        # input GFF is no longer open when it gets replaced.
        with open(gff_path, "r") as th, \
                open(table_path, "w") as out, \
                open(tmp_gff, "w") as out_gff:
            out_gff.write("##gff-version 3\n")
            out.write("\t".join(["Genome", "Name", "Start", "End", "Strand",
                                 "Detect_lib_type", "Associated_gene",
                                 "Associated_tss", "Associated_term",
                                 "Coverage_details"]) + "\n")
            trans = []
            for entry in Gff3Parser().entries(th):
                trans.append(entry)
            # Use the annotation file only when a folder was given and it
            # actually contains a file for this genome.
            gff_file = None
            if args_tran.gffs is not None:
                candidate = os.path.join(args_tran.gffs,
                                         gff.replace("_transcript", ""))
                if os.path.isfile(candidate):
                    gff_file = candidate
            print_coverage(trans, out, out_gff, wigs_f, wigs_r, gff_file)
        shutil.move(tmp_gff, gff_path)
Exemplo n.º 24
0
 def _read_lib_wig(self, args_srna):
     '''Read the library info and the wiggle data of both strands.

     Returns a list ``[libs, texs, wigs_f, wigs_r]`` where ``libs`` and
     ``texs`` come from ``read_libs`` and the two wig items come from
     ``read_wig`` called with "+" and "-" respectively.
     '''
     libs, texs = read_libs(args_srna.input_libs, args_srna.wig_folder)
     strand_files = (("+", args_srna.wig_f_file),
                     ("-", args_srna.wig_r_file))
     # Forward strand is read first, then reverse, as the result order needs.
     wigs = [read_wig(wig_file, strand, libs)
             for strand, wig_file in strand_files]
     return [libs, texs] + wigs